From e4a5be3938ef8ad29197d24f46bd3b80d18358ca Mon Sep 17 00:00:00 2001
From: Tiffany Mintz <tiffany.mintz@amd.com>
Date: Tue, 29 Apr 2025 17:14:29 -0500
Subject: [PATCH 01/42] modifications to microbenchmark suite to run on AMD
 GPUs

---
 benchmarks/microbenchmarks/_utils.py | 2 +-
 benchmarks/suite_microbenchmarks.py  | 3 ++-
 external/SPIRV-Cross                 | 2 +-
 external/SPIRV-Headers               | 2 +-
 external/SPIRV-Reflect               | 2 +-
 external/SPIRV-Tools                 | 2 +-
 external/Vulkan-Headers              | 2 +-
 external/VulkanMemoryAllocator       | 2 +-
 external/assets                      | 2 +-
 external/backward_cpp                | 2 +-
 external/spdlog                      | 2 +-
 external/volk                        | 2 +-
 12 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/benchmarks/microbenchmarks/_utils.py b/benchmarks/microbenchmarks/_utils.py
index 4c887386978b9..721db75ac98f5 100644
--- a/benchmarks/microbenchmarks/_utils.py
+++ b/benchmarks/microbenchmarks/_utils.py
@@ -53,7 +53,7 @@ def get_ti_arch(arch: str):
         "opengl": ti.opengl,
         "metal": ti.metal,
         "x64": ti.x64,
-        "cc": ti.cc,
+        "amdgpu":ti.amdgpu,
     }
     return arch_dict[arch]
 
diff --git a/benchmarks/suite_microbenchmarks.py b/benchmarks/suite_microbenchmarks.py
index e00aa90a3ae89..b020f404a334b 100644
--- a/benchmarks/suite_microbenchmarks.py
+++ b/benchmarks/suite_microbenchmarks.py
@@ -8,9 +8,10 @@
 class MicroBenchmark:
     suite_name = "microbenchmarks"
     config = {
-        "cuda": {"enable": True},
+        "cuda": {"enable": False},
         "vulkan": {"enable": False},
         "opengl": {"enable": False},
+        "amdgpu": {"enable": True},
     }
 
     def __init__(self):
diff --git a/external/SPIRV-Cross b/external/SPIRV-Cross
index c77b09b57c278..131278458ea8e 160000
--- a/external/SPIRV-Cross
+++ b/external/SPIRV-Cross
@@ -1 +1 @@
-Subproject commit c77b09b57c27837dc2d41aa371ed3d236ce9ce47
+Subproject commit 131278458ea8eebe6a6e9c476fbcf71278726e1a
diff --git a/external/SPIRV-Headers b/external/SPIRV-Headers
index 34d04647d384e..b2a156e1c0434 160000
--- a/external/SPIRV-Headers
+++ b/external/SPIRV-Headers
@@ -1 +1 @@
-Subproject commit 34d04647d384e0aed037e7a2662a655fc39841bb
+Subproject commit b2a156e1c0434bc8c99aaebba1c7be98be7ac580
diff --git a/external/SPIRV-Reflect b/external/SPIRV-Reflect
index 7c9c841fa9f40..1aceb6af56e74 160000
--- a/external/SPIRV-Reflect
+++ b/external/SPIRV-Reflect
@@ -1 +1 @@
-Subproject commit 7c9c841fa9f40c09d334d5db6629ba318e46efaf
+Subproject commit 1aceb6af56e74b92a00378842dda5c5a73f49a4b
diff --git a/external/SPIRV-Tools b/external/SPIRV-Tools
index 46ca66e6991f1..5e61ea2098220 160000
--- a/external/SPIRV-Tools
+++ b/external/SPIRV-Tools
@@ -1 +1 @@
-Subproject commit 46ca66e6991f16c89e17ebc9b86995143be2c706
+Subproject commit 5e61ea2098220059e89523f1f47b0bcd8c33b89a
diff --git a/external/Vulkan-Headers b/external/Vulkan-Headers
index 409c16be502e3..76f00ef6cbb18 160000
--- a/external/Vulkan-Headers
+++ b/external/Vulkan-Headers
@@ -1 +1 @@
-Subproject commit 409c16be502e39fe70dd6fe2d9ad4842ef2c9a53
+Subproject commit 76f00ef6cbb1886eb1162d1fa39bee8b51e22ee8
diff --git a/external/VulkanMemoryAllocator b/external/VulkanMemoryAllocator
index 539c0a8d8e373..a88bc520a08d5 160000
--- a/external/VulkanMemoryAllocator
+++ b/external/VulkanMemoryAllocator
@@ -1 +1 @@
-Subproject commit 539c0a8d8e3733c9f25ea9a184c85c77504f1653
+Subproject commit a88bc520a08d574b5b5363e56d6a5b4d106b77bc
diff --git a/external/assets b/external/assets
index 2905391325512..150b16ad12ad5 160000
--- a/external/assets
+++ b/external/assets
@@ -1 +1 @@
-Subproject commit 2905391325512f58adb3f8684bafc06ef29f8e47
+Subproject commit 150b16ad12ad58a9a93b8988ded913e632a4df4f
diff --git a/external/backward_cpp b/external/backward_cpp
index 51f0700452cf7..647eccde8e87d 160000
--- a/external/backward_cpp
+++ b/external/backward_cpp
@@ -1 +1 @@
-Subproject commit 51f0700452cf71c57d43c2d028277b24cde32502
+Subproject commit 647eccde8e87d7669be1be8c661e26f1a78a3244
diff --git a/external/spdlog b/external/spdlog
index c3aed4b683739..cf6f1dd01e660 160000
--- a/external/spdlog
+++ b/external/spdlog
@@ -1 +1 @@
-Subproject commit c3aed4b68373955e1cc94307683d44dca1515d2b
+Subproject commit cf6f1dd01e660d5865d68bf5fa78f6376b89470a
diff --git a/external/volk b/external/volk
index b87f88292b09b..695b58e71c628 160000
--- a/external/volk
+++ b/external/volk
@@ -1 +1 @@
-Subproject commit b87f88292b09bc899b24028984186581a1d24c4e
+Subproject commit 695b58e71c628d6d50b28d0ab737f110ea696e04

From e9326463dd2dc14223beddea9fbce8ac7a7eebaa Mon Sep 17 00:00:00 2001
From: Tiffany Mintz <Tiffany.Mintz@amd.com>
Date: Wed, 30 Apr 2025 20:24:40 +0000
Subject: [PATCH 02/42] adding arguments for selecting a list of architectures
 and benchmark plans to run

---
 benchmarks/requirements.txt         |  1 +
 benchmarks/run.py                   | 15 ++++++++++++++-
 benchmarks/suite_microbenchmarks.py | 23 +++++++++++------------
 3 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/benchmarks/requirements.txt b/benchmarks/requirements.txt
index 625298e5a2b64..45ac07aadefaf 100644
--- a/benchmarks/requirements.txt
+++ b/benchmarks/requirements.txt
@@ -1,2 +1,3 @@
 jsbeautifier
 bokeh
+argparse
diff --git a/benchmarks/run.py b/benchmarks/run.py
index a1e47ed326e99..bb6870257b5a0 100644
--- a/benchmarks/run.py
+++ b/benchmarks/run.py
@@ -1,5 +1,6 @@
 import os
 import warnings
+import argparse
 
 from suite_microbenchmarks import MicroBenchmark
 from taichi._lib import core as ti_python_core
@@ -39,15 +40,27 @@ def get_suites_info(self):
             info_dict[suite.suite_name] = suite.get_benchmark_info()
         return info_dict
 
+def parse_cmdln()
+    parser = argparse.ArgumentParser(prog='run.py', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument("--archs", 
+            choices=['amdgpu', 'cuda', 'vulkan', 'opengl', 'metal', 'x64'],
+            required=True, help="Architecture to benchmark")
+    parser.add_argument("--benchmark_plan", 
+            choices=['AtomicOpsPlan', 'FillPlan', 'MathOpsPlan', 
+                'MatrixOpsPlan', 'MemcpyPlan', 'SaxpyPlan', 'Stencil2DPlan'],
+            required=True, help="Benchmark plan to run")
+    args = parser.parse_args()
+    return args
 
 def main():
+    args = parse_cmdln()
     benchmark_dir = os.path.join(os.getcwd(), "results")
     os.makedirs(benchmark_dir, exist_ok=True)
 
     # init & run
     info = BenchmarkInfo()
     suites = BenchmarkSuites()
-    suites.run()
+    suites.run(args.archs, args.benchmark_plan)
     # save benchmark results & info
     suites.save(benchmark_dir)
     info.suites = suites.get_suites_info()
diff --git a/benchmarks/suite_microbenchmarks.py b/benchmarks/suite_microbenchmarks.py
index b020f404a334b..09e9ba3c898fc 100644
--- a/benchmarks/suite_microbenchmarks.py
+++ b/benchmarks/suite_microbenchmarks.py
@@ -27,18 +27,17 @@ def get_benchmark_info(self):
         info_dict["archs"] = arch_list
         return info_dict
 
-    def run(self):
-        for arch, item in self.config.items():
-            if item["enable"] == True:
-                arch_results = {}
-                self._info[arch] = {}
-                for plan in benchmark_plan_list:
-                    plan_impl = plan(arch)
-                    results = plan_impl.run()
-                    self._info[arch][plan_impl.name] = results["info"]
-                    arch_results[plan_impl.name] = results["results"]
-
-                self._results[arch] = arch_results
+    def run(self, arch_list, benchmark_plans):
+        for arch in arch_list:
+            arch_results = {}
+            self._info[arch] = {}
+            for plan in benchmark_plans:
+                plan_impl = plan(arch)
+                results = plan_impl.run()
+                self._info[arch][plan_impl.name] = results["info"]
+                arch_results[plan_impl.name] = results["results"]
+
+            self._results[arch] = arch_results
 
     def save_as_json(self, suite_dir="./"):
         for arch in self._results:

From 8a9ca3ba1cfdd668137cfe40acf2d4190cbe1f18 Mon Sep 17 00:00:00 2001
From: Tiffany Mintz <Tiffany.Mintz@amd.com>
Date: Thu, 1 May 2025 15:39:54 +0000
Subject: [PATCH 03/42] additional modifications for single arch and benchmark
 plan runs

---
 benchmarks/run.py                   | 22 ++++++++---------
 benchmarks/suite_microbenchmarks.py | 38 ++++++++++++++++++++---------
 2 files changed, 38 insertions(+), 22 deletions(-)

diff --git a/benchmarks/run.py b/benchmarks/run.py
index bb6870257b5a0..680444d01065b 100644
--- a/benchmarks/run.py
+++ b/benchmarks/run.py
@@ -24,9 +24,9 @@ def __init__(self):
         for suite in benchmark_suites:
             self._suites.append(suite())
 
-    def run(self):
+    def run(self, arch, benchmark_plan):
         for suite in self._suites:
-            suite.run()
+            suite.run(arch, benchmark_plan)
 
     def save(self, benchmark_dir="./"):
         for suite in self._suites:
@@ -40,9 +40,9 @@ def get_suites_info(self):
             info_dict[suite.suite_name] = suite.get_benchmark_info()
         return info_dict
 
-def parse_cmdln()
+def parse_cmdln():
     parser = argparse.ArgumentParser(prog='run.py', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument("--archs", 
+    parser.add_argument("--arch",
             choices=['amdgpu', 'cuda', 'vulkan', 'opengl', 'metal', 'x64'],
             required=True, help="Architecture to benchmark")
     parser.add_argument("--benchmark_plan", 
@@ -60,14 +60,14 @@ def main():
     # init & run
     info = BenchmarkInfo()
     suites = BenchmarkSuites()
-    suites.run(args.archs, args.benchmark_plan)
+    suites.run(args.arch, args.benchmark_plan)
     # save benchmark results & info
-    suites.save(benchmark_dir)
-    info.suites = suites.get_suites_info()
-    info_path = os.path.join(benchmark_dir, "_info.json")
-    info_str = dump2json(info)
-    with open(info_path, "w") as f:
-        print(info_str, file=f)
+#    suites.save(benchmark_dir)
+#    info.suites = suites.get_suites_info()
+#    info_path = os.path.join(benchmark_dir, "_info.json")
+#    info_str = dump2json(info)
+#    with open(info_path, "w") as f:
+#        print(info_str, file=f)
 
 
 if __name__ == "__main__":
diff --git a/benchmarks/suite_microbenchmarks.py b/benchmarks/suite_microbenchmarks.py
index 09e9ba3c898fc..05f581a0a9dd8 100644
--- a/benchmarks/suite_microbenchmarks.py
+++ b/benchmarks/suite_microbenchmarks.py
@@ -26,18 +26,34 @@ def get_benchmark_info(self):
                 arch_list.append(arch)
         info_dict["archs"] = arch_list
         return info_dict
+    
+    def str_to_plan(self, plan_str):
+        match plan_str:
+            case "AtomicOpsPlan":
+                return benchmark_plan_list[0]
+            case "FillPlan":
+                return benchmark_plan_list[1]
+            case "MathOpsPlan":
+                return benchmark_plan_list[2]
+            case "MatrixOpsPlan":
+                return benchmark_plan_list[3]
+            case "MemcpyPlan":
+                return benchmark_plan_list[4]
+            case "SaxpyPlan":
+                return benchmark_plan_list[5]
+            case "Stencil2DPlan":
+                return benchmark_plan_list[6]
 
-    def run(self, arch_list, benchmark_plans):
-        for arch in arch_list:
-            arch_results = {}
-            self._info[arch] = {}
-            for plan in benchmark_plans:
-                plan_impl = plan(arch)
-                results = plan_impl.run()
-                self._info[arch][plan_impl.name] = results["info"]
-                arch_results[plan_impl.name] = results["results"]
-
-            self._results[arch] = arch_results
+    def run(self, arch, plan_str):
+        arch_results = {}
+        self._info[arch] = {}
+        plan = self.str_to_plan(plan_str)
+        plan_impl = plan(arch)
+        results = plan_impl.run()
+        self._info[arch][plan_impl.name] = results["info"]
+        arch_results[plan_impl.name] = results["results"]
+
+        self._results[arch] = arch_results
 
     def save_as_json(self, suite_dir="./"):
         for arch in self._results:

From 6e4fb081ae47bf3acb5ae37cd990ecd7f9bb85c5 Mon Sep 17 00:00:00 2001
From: Tiffany Mintz <Tiffany.Mintz@amd.com>
Date: Wed, 7 May 2025 10:16:16 -0500
Subject: [PATCH 04/42] temporarily setting atomic ops repeat to 1

---
 benchmarks/microbenchmarks/atomic_ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/microbenchmarks/atomic_ops.py b/benchmarks/microbenchmarks/atomic_ops.py
index 05abbc04e14eb..51c3d486c8d4a 100644
--- a/benchmarks/microbenchmarks/atomic_ops.py
+++ b/benchmarks/microbenchmarks/atomic_ops.py
@@ -31,7 +31,7 @@ def reduction_array(y: ti.types.ndarray(), x: ti.types.ndarray()):
 
 class AtomicOpsPlan(BenchmarkPlan):
     def __init__(self, arch: str):
-        super().__init__("atomic_ops", arch, basic_repeat_times=10)
+        super().__init__("atomic_ops", arch, basic_repeat_times=1)
         atomic_ops = AtomicOps()
         atomic_ops.remove(["atomic_sub", "atomic_and", "atomic_xor", "atomic_max"])
         self.create_plan(atomic_ops, Container(), DataType(), DataSize(), MetricType())

From efe237e34d9fbb578be6a0de35b9868c8ba765c9 Mon Sep 17 00:00:00 2001
From: Tiffany Mintz <Tiffany.Mintz@amd.com>
Date: Thu, 8 May 2025 22:51:08 +0000
Subject: [PATCH 05/42] updating vulkan sdk downlaod url

---
 .github/workflows/scripts/ti_build/vulkan.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/scripts/ti_build/vulkan.py b/.github/workflows/scripts/ti_build/vulkan.py
index 5678180862c46..eb757fc77310c 100644
--- a/.github/workflows/scripts/ti_build/vulkan.py
+++ b/.github/workflows/scripts/ti_build/vulkan.py
@@ -12,12 +12,12 @@
 
 
 # -- code --
-@banner("Setup Vulkan 1.3.236.0")
+@banner("Setup Vulkan 1.3.250.1")
 def setup_vulkan():
     u = platform.uname()
     if u.system == "Linux":
-        url = "https://sdk.lunarg.com/sdk/download/1.3.236.0/linux/vulkansdk-linux-x86_64-1.3.236.0.tar.gz"
-        prefix = get_cache_home() / "vulkan-1.3.236.0"
+        url = "https://sdk.lunarg.com/sdk/download/1.3.250.1/linux/vulkansdk-linux-x86_64-1.3.250.1.tar.gz"
+        prefix = get_cache_home() / "vulkan-1.3.250.1"
         download_dep(url, prefix, strip=1)
         sdk = prefix / "x86_64"
         os.environ["VULKAN_SDK"] = str(sdk)
@@ -27,8 +27,8 @@ def setup_vulkan():
     # elif (u.system, u.machine) == ("Darwin", "arm64"):
     # elif (u.system, u.machine) == ("Darwin", "x86_64"):
     elif (u.system, u.machine) == ("Windows", "AMD64"):
-        url = "https://sdk.lunarg.com/sdk/download/1.3.236.0/windows/VulkanSDK-1.3.236.0-Installer.exe"
-        prefix = get_cache_home() / "vulkan-1.3.236.0"
+        url = "https://sdk.lunarg.com/sdk/download/1.3.250.1/windows/VulkanSDK-1.3.250.1-Installer.exe"
+        prefix = get_cache_home() / "vulkan-1.3.250.1"
         download_dep(
             url,
             prefix,

From c3d7b846094216cc1724fc952f83fa2a81fe0353 Mon Sep 17 00:00:00 2001
From: Tiffany Mintz <Tiffany.Mintz@amd.com>
Date: Mon, 12 May 2025 20:29:53 +0000
Subject: [PATCH 06/42] removing comments for saving json files

---
 benchmarks/run.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/benchmarks/run.py b/benchmarks/run.py
index 680444d01065b..df651f5650bc3 100644
--- a/benchmarks/run.py
+++ b/benchmarks/run.py
@@ -62,12 +62,12 @@ def main():
     suites = BenchmarkSuites()
     suites.run(args.arch, args.benchmark_plan)
     # save benchmark results & info
-#    suites.save(benchmark_dir)
-#    info.suites = suites.get_suites_info()
-#    info_path = os.path.join(benchmark_dir, "_info.json")
-#    info_str = dump2json(info)
-#    with open(info_path, "w") as f:
-#        print(info_str, file=f)
+    suites.save(benchmark_dir)
+    info.suites = suites.get_suites_info()
+    info_path = os.path.join(benchmark_dir, "_info.json")
+    info_str = dump2json(info)
+    with open(info_path, "w") as f:
+        print(info_str, file=f)
 
 
 if __name__ == "__main__":

From bb8a9b3bfddd93226ee708774be00fa8c4a8faf4 Mon Sep 17 00:00:00 2001
From: Bhavesh Lad <bhavelad@amd.com>
Date: Thu, 26 Jun 2025 14:08:21 -0500
Subject: [PATCH 07/42] Patch to avoid the need to fetch to build Taichi wheel

---
 .github/workflows/scripts/ti_build/entry.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/scripts/ti_build/entry.py b/.github/workflows/scripts/ti_build/entry.py
index 570dc995dee13..589b5a0a0b105 100644
--- a/.github/workflows/scripts/ti_build/entry.py
+++ b/.github/workflows/scripts/ti_build/entry.py
@@ -33,7 +33,7 @@ def build_wheel(python: Command, pip: Command) -> None:
     Build the Taichi wheel
     """
 
-    git.fetch("origin", "master", "--tags", "--force")
+    #git.fetch("origin", "master", "--tags", "--force")
     proj_tags = []
     extra = []
 

From b74c00cc5457ff030f960dbbdf066163292a0b22 Mon Sep 17 00:00:00 2001
From: Bhavesh Lad <bhavelad@amd.com>
Date: Wed, 9 Jul 2025 00:57:33 -0500
Subject: [PATCH 08/42] Taicho Multistage Dockerfile

---
 .github/workflows/scripts/ti_build/entry.py |  2 +-
 Dockerfile.rocm                             | 65 +++++++++++++++++++++
 2 files changed, 66 insertions(+), 1 deletion(-)
 create mode 100644 Dockerfile.rocm

diff --git a/.github/workflows/scripts/ti_build/entry.py b/.github/workflows/scripts/ti_build/entry.py
index 570dc995dee13..589b5a0a0b105 100644
--- a/.github/workflows/scripts/ti_build/entry.py
+++ b/.github/workflows/scripts/ti_build/entry.py
@@ -33,7 +33,7 @@ def build_wheel(python: Command, pip: Command) -> None:
     Build the Taichi wheel
     """
 
-    git.fetch("origin", "master", "--tags", "--force")
+    #git.fetch("origin", "master", "--tags", "--force")
     proj_tags = []
     extra = []
 
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
new file mode 100644
index 0000000000000..5c5f664147475
--- /dev/null
+++ b/Dockerfile.rocm
@@ -0,0 +1,65 @@
+# ---------------------------------------------
+# Stage 1: Build Taichi and generate artifacts
+# ---------------------------------------------
+ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.2
+FROM ${BASE_IMAGE} as taichi_build
+
+ARG LLVM_VERSION=15
+ARG GPU_TARGETS=gfx90a,gfx942
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV SRC_DIR=/src
+ENV TAICHI_SRC=${SRC_DIR}/taichi
+ENV LLVM_DIR=/usr/lib/llvm-${LLVM_VERSION}
+ENV PATH=${LLVM_DIR}/bin:$PATH
+ENV TAICHI_CMAKE_ARGS="-DTI_WITH_VULKAN=OFF -DTI_WITH_OPENGL=OFF -DTI_BUILD_TESTS=ON -DTI_BUILD_EXAMPLES=OFF -DCMAKE_PREFIX_PATH=${LLVM_DIR}/lib/cmake -DCMAKE_CXX_COMPILER=${LLVM_DIR}/bin/clang++ -DTI_WITH_AMDGPU=ON -DTI_WITH_CUDA=OFF -DTI_AMDGPU_ARCHS=${GPU_TARGETS}"
+
+# Install build dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    git wget \
+    freeglut3-dev libglfw3-dev libglm-dev libglu1-mesa-dev \
+    libjpeg-dev liblz4-dev libpng-dev libssl-dev \
+    libwayland-dev libx11-xcb-dev libxcb-dri3-dev libxcb-ewmh-dev \
+    libxcb-keysyms1-dev libxcb-randr0-dev libxcursor-dev libxi-dev \
+    libxinerama-dev libxrandr-dev libzstd-dev \
+    python3-pip cmake ca-certificates \
+    llvm-${LLVM_VERSION} clang-${LLVM_VERSION} lld-${LLVM_VERSION} \
+ && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# Copy source code
+COPY . ${SRC_DIR}
+RUN git config --global --add safe.directory "${TAICHI_SRC}"
+
+# Build Taichi and generate artifacts
+RUN cd ${TAICHI_SRC} && \
+    ./build.py && \
+    mkdir -p /tmp/artifacts && \
+    cp dist/taichi*.whl /tmp/artifacts/ && \
+    tar -czvf /tmp/artifacts/taichi-tests.tar.gz -C /src taichi
+
+# ---------------------------------------------
+# Stage 2: Create runtime image with Taichi installed
+# ---------------------------------------------
+FROM ${BASE_IMAGE} as taichi_final
+
+ARG LLVM_VERSION=15
+ENV DEBIAN_FRONTEND=noninteractive
+ENV LLVM_DIR=/usr/lib/llvm-${LLVM_VERSION}
+ENV PATH=${LLVM_DIR}/bin:$PATH
+
+# Install runtime dependencies
+RUN apt-get update && apt-get install -y \
+    python3-pip lld-${LLVM_VERSION} && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# Copy and install Taichi wheel
+COPY --from=taichi_build /tmp/artifacts/taichi*.whl /
+RUN pip3 install --no-cache-dir --upgrade pip setuptools wheel && \
+    pip3 install /taichi*.whl && \
+    rm /taichi*.whl
+
+# ---------------------------------------------
+# Stage 3: Export raw artifacts to host
+# ---------------------------------------------
+FROM scratch as taichi_export
+COPY --from=taichi_build /tmp/artifacts/ .

From f791165302a28b5c242a6d94099e9fbb90f2b335 Mon Sep 17 00:00:00 2001
From: tmm77 <6461957+tmm77@users.noreply.github.com>
Date: Fri, 11 Jul 2025 11:47:24 -0400
Subject: [PATCH 09/42] setting architecture to gpu

---
 python/taichi/examples/algorithm/laplace.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/taichi/examples/algorithm/laplace.py b/python/taichi/examples/algorithm/laplace.py
index 6b8bb9fb3c893..20c02713ac50a 100644
--- a/python/taichi/examples/algorithm/laplace.py
+++ b/python/taichi/examples/algorithm/laplace.py
@@ -1,6 +1,6 @@
 import taichi as ti
 
-ti.init(arch=ti.cpu)
+ti.init(arch=ti.gpu)
 
 N = 16
 

From 1a6520a533bf127e760eefdc49accfd29728cce9 Mon Sep 17 00:00:00 2001
From: Bhanu Kiran Atturu <bhatturu@amd.com>
Date: Fri, 25 Jul 2025 10:19:28 -0400
Subject: [PATCH 10/42] ROCm port of taichi

Co-authored-by: Bhavesh Lad <Bhavesh.Lad@amd.com>
Co-authored-by: Tiffany Mintz <tiffany.mintz@amd.com>
---
 .github/workflows/scripts/ti_build/entry.py  |  2 +-
 .github/workflows/scripts/ti_build/vulkan.py | 10 +--
 Dockerfile.rocm                              | 76 ++++++++++++++++++++
 benchmarks/microbenchmarks/_utils.py         |  2 +-
 benchmarks/microbenchmarks/atomic_ops.py     |  2 +-
 benchmarks/requirements.txt                  |  1 +
 benchmarks/run.py                            | 19 ++++-
 benchmarks/suite_microbenchmarks.py          | 42 +++++++----
 external/SPIRV-Cross                         |  2 +-
 external/SPIRV-Headers                       |  2 +-
 external/SPIRV-Reflect                       |  2 +-
 external/SPIRV-Tools                         |  2 +-
 external/Vulkan-Headers                      |  2 +-
 external/VulkanMemoryAllocator               |  2 +-
 external/assets                              |  2 +-
 external/backward_cpp                        |  2 +-
 external/spdlog                              |  2 +-
 external/volk                                |  2 +-
 18 files changed, 140 insertions(+), 34 deletions(-)
 create mode 100644 Dockerfile.rocm

diff --git a/.github/workflows/scripts/ti_build/entry.py b/.github/workflows/scripts/ti_build/entry.py
index 570dc995dee13..589b5a0a0b105 100644
--- a/.github/workflows/scripts/ti_build/entry.py
+++ b/.github/workflows/scripts/ti_build/entry.py
@@ -33,7 +33,7 @@ def build_wheel(python: Command, pip: Command) -> None:
     Build the Taichi wheel
     """
 
-    git.fetch("origin", "master", "--tags", "--force")
+    #git.fetch("origin", "master", "--tags", "--force")
     proj_tags = []
     extra = []
 
diff --git a/.github/workflows/scripts/ti_build/vulkan.py b/.github/workflows/scripts/ti_build/vulkan.py
index 5678180862c46..eb757fc77310c 100644
--- a/.github/workflows/scripts/ti_build/vulkan.py
+++ b/.github/workflows/scripts/ti_build/vulkan.py
@@ -12,12 +12,12 @@
 
 
 # -- code --
-@banner("Setup Vulkan 1.3.236.0")
+@banner("Setup Vulkan 1.3.250.1")
 def setup_vulkan():
     u = platform.uname()
     if u.system == "Linux":
-        url = "https://sdk.lunarg.com/sdk/download/1.3.236.0/linux/vulkansdk-linux-x86_64-1.3.236.0.tar.gz"
-        prefix = get_cache_home() / "vulkan-1.3.236.0"
+        url = "https://sdk.lunarg.com/sdk/download/1.3.250.1/linux/vulkansdk-linux-x86_64-1.3.250.1.tar.gz"
+        prefix = get_cache_home() / "vulkan-1.3.250.1"
         download_dep(url, prefix, strip=1)
         sdk = prefix / "x86_64"
         os.environ["VULKAN_SDK"] = str(sdk)
@@ -27,8 +27,8 @@ def setup_vulkan():
     # elif (u.system, u.machine) == ("Darwin", "arm64"):
     # elif (u.system, u.machine) == ("Darwin", "x86_64"):
     elif (u.system, u.machine) == ("Windows", "AMD64"):
-        url = "https://sdk.lunarg.com/sdk/download/1.3.236.0/windows/VulkanSDK-1.3.236.0-Installer.exe"
-        prefix = get_cache_home() / "vulkan-1.3.236.0"
+        url = "https://sdk.lunarg.com/sdk/download/1.3.250.1/windows/VulkanSDK-1.3.250.1-Installer.exe"
+        prefix = get_cache_home() / "vulkan-1.3.250.1"
         download_dep(
             url,
             prefix,
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
new file mode 100644
index 0000000000000..6f861ff2112c5
--- /dev/null
+++ b/Dockerfile.rocm
@@ -0,0 +1,76 @@
+# ---------------------------------------------
+# Stage 1: Build Taichi and generate artifacts
+# ---------------------------------------------
+ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.2
+FROM ${BASE_IMAGE} as taichi_build
+
+ARG LLVM_VERSION=15
+ARG GPU_TARGETS=gfx90a,gfx942
+ARG PROJECT_NAME
+ARG RELEASE_VERSION
+ARG TI_VERSION_MAJOR
+ARG TI_VERSION_MINOR
+ARG TI_VERSION_PATCH
+
+ENV PROJECT_NAME=${PROJECT_NAME}
+ENV RELEASE_VERSION=${RELEASE_VERSION}
+ENV TI_VERSION_MAJOR=${TI_VERSION_MAJOR}
+ENV TI_VERSION_MINOR=${TI_VERSION_MINOR}
+ENV TI_VERSION_PATCH=${TI_VERSION_PATCH}
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV SRC_DIR=/src
+ENV TAICHI_SRC=${SRC_DIR}/taichi
+ENV LLVM_DIR=/usr/lib/llvm-${LLVM_VERSION}
+ENV PATH=${LLVM_DIR}/bin:$PATH
+ENV TAICHI_CMAKE_ARGS="-DTI_WITH_VULKAN=OFF -DTI_WITH_OPENGL=OFF -DTI_BUILD_TESTS=ON -DTI_BUILD_EXAMPLES=OFF -DCMAKE_PREFIX_PATH=${LLVM_DIR}/lib/cmake -DCMAKE_CXX_COMPILER=${LLVM_DIR}/bin/clang++ -DTI_WITH_AMDGPU=ON -DTI_WITH_CUDA=OFF -DTI_AMDGPU_ARCHS=${GPU_TARGETS}"
+
+# Install build dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    git wget \
+    freeglut3-dev libglfw3-dev libglm-dev libglu1-mesa-dev \
+    libjpeg-dev liblz4-dev libpng-dev libssl-dev \
+    libwayland-dev libx11-xcb-dev libxcb-dri3-dev libxcb-ewmh-dev \
+    libxcb-keysyms1-dev libxcb-randr0-dev libxcursor-dev libxi-dev \
+    libxinerama-dev libxrandr-dev libzstd-dev \
+    python3-pip cmake pybind11-dev ca-certificates \
+    llvm-${LLVM_VERSION} clang-${LLVM_VERSION} lld-${LLVM_VERSION} \
+ && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# Copy source code
+COPY . ${SRC_DIR}
+RUN git config --global --add safe.directory "${TAICHI_SRC}"
+
+# Build Taichi and generate artifacts
+RUN cd ${TAICHI_SRC} && \
+    ./build.py && \
+    mkdir -p /tmp/artifacts && \
+    cp dist/amd_taichi*.whl /tmp/artifacts/ && \
+    tar -czvf /tmp/artifacts/taichi-tests.tar.gz -C /src taichi
+
+# ---------------------------------------------
+# Stage 2: Create runtime image with Taichi installed
+# ---------------------------------------------
+FROM ${BASE_IMAGE} as taichi_final
+
+ARG LLVM_VERSION=15
+ENV DEBIAN_FRONTEND=noninteractive
+ENV LLVM_DIR=/usr/lib/llvm-${LLVM_VERSION}
+ENV PATH=${LLVM_DIR}/bin:$PATH
+
+# Install runtime dependencies
+RUN apt-get update && apt-get install -y \
+    python3-pip lld-${LLVM_VERSION} && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# Copy and install Taichi wheel
+COPY --from=taichi_build /tmp/artifacts/amd_taichi*.whl /
+RUN pip3 install --no-cache-dir --upgrade pip setuptools wheel && \
+    pip3 install /amd_taichi*.whl && \
+    rm /amd_taichi*.whl
+
+# ---------------------------------------------
+# Stage 3: Export raw artifacts to host
+# ---------------------------------------------
+FROM scratch as taichi_export
+COPY --from=taichi_build /tmp/artifacts/ .
diff --git a/benchmarks/microbenchmarks/_utils.py b/benchmarks/microbenchmarks/_utils.py
index 4c887386978b9..721db75ac98f5 100644
--- a/benchmarks/microbenchmarks/_utils.py
+++ b/benchmarks/microbenchmarks/_utils.py
@@ -53,7 +53,7 @@ def get_ti_arch(arch: str):
         "opengl": ti.opengl,
         "metal": ti.metal,
         "x64": ti.x64,
-        "cc": ti.cc,
+        "amdgpu":ti.amdgpu,
     }
     return arch_dict[arch]
 
diff --git a/benchmarks/microbenchmarks/atomic_ops.py b/benchmarks/microbenchmarks/atomic_ops.py
index 05abbc04e14eb..51c3d486c8d4a 100644
--- a/benchmarks/microbenchmarks/atomic_ops.py
+++ b/benchmarks/microbenchmarks/atomic_ops.py
@@ -31,7 +31,7 @@ def reduction_array(y: ti.types.ndarray(), x: ti.types.ndarray()):
 
 class AtomicOpsPlan(BenchmarkPlan):
     def __init__(self, arch: str):
-        super().__init__("atomic_ops", arch, basic_repeat_times=10)
+        super().__init__("atomic_ops", arch, basic_repeat_times=1)
         atomic_ops = AtomicOps()
         atomic_ops.remove(["atomic_sub", "atomic_and", "atomic_xor", "atomic_max"])
         self.create_plan(atomic_ops, Container(), DataType(), DataSize(), MetricType())
diff --git a/benchmarks/requirements.txt b/benchmarks/requirements.txt
index 625298e5a2b64..45ac07aadefaf 100644
--- a/benchmarks/requirements.txt
+++ b/benchmarks/requirements.txt
@@ -1,2 +1,3 @@
 jsbeautifier
 bokeh
+argparse
diff --git a/benchmarks/run.py b/benchmarks/run.py
index a1e47ed326e99..df651f5650bc3 100644
--- a/benchmarks/run.py
+++ b/benchmarks/run.py
@@ -1,5 +1,6 @@
 import os
 import warnings
+import argparse
 
 from suite_microbenchmarks import MicroBenchmark
 from taichi._lib import core as ti_python_core
@@ -23,9 +24,9 @@ def __init__(self):
         for suite in benchmark_suites:
             self._suites.append(suite())
 
-    def run(self):
+    def run(self, arch, benchmark_plan):
         for suite in self._suites:
-            suite.run()
+            suite.run(arch, benchmark_plan)
 
     def save(self, benchmark_dir="./"):
         for suite in self._suites:
@@ -39,15 +40,27 @@ def get_suites_info(self):
             info_dict[suite.suite_name] = suite.get_benchmark_info()
         return info_dict
 
+def parse_cmdln():
+    parser = argparse.ArgumentParser(prog='run.py', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument("--arch",
+            choices=['amdgpu', 'cuda', 'vulkan', 'opengl', 'metal', 'x64'],
+            required=True, help="Architecture to benchmark")
+    parser.add_argument("--benchmark_plan", 
+            choices=['AtomicOpsPlan', 'FillPlan', 'MathOpsPlan', 
+                'MatrixOpsPlan', 'MemcpyPlan', 'SaxpyPlan', 'Stencil2DPlan'],
+            required=True, help="Benchmark plan to run")
+    args = parser.parse_args()
+    return args
 
 def main():
+    args = parse_cmdln()
     benchmark_dir = os.path.join(os.getcwd(), "results")
     os.makedirs(benchmark_dir, exist_ok=True)
 
     # init & run
     info = BenchmarkInfo()
     suites = BenchmarkSuites()
-    suites.run()
+    suites.run(args.arch, args.benchmark_plan)
     # save benchmark results & info
     suites.save(benchmark_dir)
     info.suites = suites.get_suites_info()
diff --git a/benchmarks/suite_microbenchmarks.py b/benchmarks/suite_microbenchmarks.py
index e00aa90a3ae89..05f581a0a9dd8 100644
--- a/benchmarks/suite_microbenchmarks.py
+++ b/benchmarks/suite_microbenchmarks.py
@@ -8,9 +8,10 @@
 class MicroBenchmark:
     suite_name = "microbenchmarks"
     config = {
-        "cuda": {"enable": True},
+        "cuda": {"enable": False},
         "vulkan": {"enable": False},
         "opengl": {"enable": False},
+        "amdgpu": {"enable": True},
     }
 
     def __init__(self):
@@ -25,19 +26,34 @@ def get_benchmark_info(self):
                 arch_list.append(arch)
         info_dict["archs"] = arch_list
         return info_dict
+    
+    def str_to_plan(self, plan_str):
+        match plan_str:
+            case "AtomicOpsPlan":
+                return benchmark_plan_list[0]
+            case "FillPlan":
+                return benchmark_plan_list[1]
+            case "MathOpsPlan":
+                return benchmark_plan_list[2]
+            case "MatrixOpsPlan":
+                return benchmark_plan_list[3]
+            case "MemcpyPlan":
+                return benchmark_plan_list[4]
+            case "SaxpyPlan":
+                return benchmark_plan_list[5]
+            case "Stencil2DPlan":
+                return benchmark_plan_list[6]
 
-    def run(self):
-        for arch, item in self.config.items():
-            if item["enable"] == True:
-                arch_results = {}
-                self._info[arch] = {}
-                for plan in benchmark_plan_list:
-                    plan_impl = plan(arch)
-                    results = plan_impl.run()
-                    self._info[arch][plan_impl.name] = results["info"]
-                    arch_results[plan_impl.name] = results["results"]
-
-                self._results[arch] = arch_results
+    def run(self, arch, plan_str):
+        arch_results = {}
+        self._info[arch] = {}
+        plan = self.str_to_plan(plan_str)
+        plan_impl = plan(arch)
+        results = plan_impl.run()
+        self._info[arch][plan_impl.name] = results["info"]
+        arch_results[plan_impl.name] = results["results"]
+
+        self._results[arch] = arch_results
 
     def save_as_json(self, suite_dir="./"):
         for arch in self._results:
diff --git a/external/SPIRV-Cross b/external/SPIRV-Cross
index c77b09b57c278..131278458ea8e 160000
--- a/external/SPIRV-Cross
+++ b/external/SPIRV-Cross
@@ -1 +1 @@
-Subproject commit c77b09b57c27837dc2d41aa371ed3d236ce9ce47
+Subproject commit 131278458ea8eebe6a6e9c476fbcf71278726e1a
diff --git a/external/SPIRV-Headers b/external/SPIRV-Headers
index 34d04647d384e..b2a156e1c0434 160000
--- a/external/SPIRV-Headers
+++ b/external/SPIRV-Headers
@@ -1 +1 @@
-Subproject commit 34d04647d384e0aed037e7a2662a655fc39841bb
+Subproject commit b2a156e1c0434bc8c99aaebba1c7be98be7ac580
diff --git a/external/SPIRV-Reflect b/external/SPIRV-Reflect
index 7c9c841fa9f40..1aceb6af56e74 160000
--- a/external/SPIRV-Reflect
+++ b/external/SPIRV-Reflect
@@ -1 +1 @@
-Subproject commit 7c9c841fa9f40c09d334d5db6629ba318e46efaf
+Subproject commit 1aceb6af56e74b92a00378842dda5c5a73f49a4b
diff --git a/external/SPIRV-Tools b/external/SPIRV-Tools
index 46ca66e6991f1..5e61ea2098220 160000
--- a/external/SPIRV-Tools
+++ b/external/SPIRV-Tools
@@ -1 +1 @@
-Subproject commit 46ca66e6991f16c89e17ebc9b86995143be2c706
+Subproject commit 5e61ea2098220059e89523f1f47b0bcd8c33b89a
diff --git a/external/Vulkan-Headers b/external/Vulkan-Headers
index 409c16be502e3..76f00ef6cbb18 160000
--- a/external/Vulkan-Headers
+++ b/external/Vulkan-Headers
@@ -1 +1 @@
-Subproject commit 409c16be502e39fe70dd6fe2d9ad4842ef2c9a53
+Subproject commit 76f00ef6cbb1886eb1162d1fa39bee8b51e22ee8
diff --git a/external/VulkanMemoryAllocator b/external/VulkanMemoryAllocator
index 539c0a8d8e373..a88bc520a08d5 160000
--- a/external/VulkanMemoryAllocator
+++ b/external/VulkanMemoryAllocator
@@ -1 +1 @@
-Subproject commit 539c0a8d8e3733c9f25ea9a184c85c77504f1653
+Subproject commit a88bc520a08d574b5b5363e56d6a5b4d106b77bc
diff --git a/external/assets b/external/assets
index 2905391325512..150b16ad12ad5 160000
--- a/external/assets
+++ b/external/assets
@@ -1 +1 @@
-Subproject commit 2905391325512f58adb3f8684bafc06ef29f8e47
+Subproject commit 150b16ad12ad58a9a93b8988ded913e632a4df4f
diff --git a/external/backward_cpp b/external/backward_cpp
index 51f0700452cf7..647eccde8e87d 160000
--- a/external/backward_cpp
+++ b/external/backward_cpp
@@ -1 +1 @@
-Subproject commit 51f0700452cf71c57d43c2d028277b24cde32502
+Subproject commit 647eccde8e87d7669be1be8c661e26f1a78a3244
diff --git a/external/spdlog b/external/spdlog
index c3aed4b683739..cf6f1dd01e660 160000
--- a/external/spdlog
+++ b/external/spdlog
@@ -1 +1 @@
-Subproject commit c3aed4b68373955e1cc94307683d44dca1515d2b
+Subproject commit cf6f1dd01e660d5865d68bf5fa78f6376b89470a
diff --git a/external/volk b/external/volk
index b87f88292b09b..695b58e71c628 160000
--- a/external/volk
+++ b/external/volk
@@ -1 +1 @@
-Subproject commit b87f88292b09bc899b24028984186581a1d24c4e
+Subproject commit 695b58e71c628d6d50b28d0ab737f110ea696e04

From 0f2615c6626c21a3129628af3f729ee32ad36c5d Mon Sep 17 00:00:00 2001
From: johnnynunez <johnnynuca14@gmail.com>
Date: Wed, 11 Jun 2025 15:18:52 +0200
Subject: [PATCH 11/42] LLVM-20

---
 taichi/codegen/cpu/codegen_cpu.cpp            | 154 ++++++++++--------
 .../dx12/dx12_global_optimize_module.cpp      | 119 ++++++++------
 taichi/runtime/amdgpu/jit_amdgpu.h            |  28 +++-
 taichi/runtime/cpu/jit_cpu.cpp                |  29 +++-
 taichi/runtime/cuda/jit_cuda.h                |  26 ++-
 taichi/runtime/llvm/llvm_context_pass.h       | 126 ++++++++------
 6 files changed, 288 insertions(+), 194 deletions(-)

diff --git a/taichi/codegen/cpu/codegen_cpu.cpp b/taichi/codegen/cpu/codegen_cpu.cpp
index aa7ac005a4ac4..927de0780777c 100644
--- a/taichi/codegen/cpu/codegen_cpu.cpp
+++ b/taichi/codegen/cpu/codegen_cpu.cpp
@@ -1,3 +1,5 @@
+// taichi/codegen/cpu/codegen_cpu.cpp
+
 #include "taichi/codegen/cpu/codegen_cpu.h"
 
 #include "taichi/runtime/program_impls/llvm/llvm_program.h"
@@ -12,12 +14,25 @@
 #include "taichi/ir/analysis.h"
 #include "taichi/analysis/offline_cache_util.h"
 
-#include "llvm/Support/Host.h"
+// === CHANGED SECTION: HEADER INCLUDES ===
+// #include "llvm/Support/Host.h" // Obsolete: This header has been removed.
+#include "llvm/TargetParser/Host.h" // Replacement for Host.h to get sys::getHostCPUName()
 #include "llvm/MC/TargetRegistry.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+// #include "llvm/Transforms/IPO.h" // Obsolete: Part of the Legacy Pass Manager
+// #include "llvm/Transforms/IPO/PassManagerBuilder.h" // Obsolete: Part of the Legacy Pass Manager
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
+// New includes for the New Pass Manager (NPM)
+#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Passes/StandardInstrumentations.h"
+#include "llvm/Transforms/Scalar/IndVarSimplify.h"
+#include "llvm/Transforms/Scalar/LoopStrengthReduce.h"
+#include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h"
+#include "llvm/Transforms/Scalar/EarlyCSE.h"
+#include "llvm/Transforms/IPO/FunctionAttrs.h"
+#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
+// === END OF CHANGED SECTION ===
+
 
 namespace taichi::lang {
 
@@ -51,10 +66,16 @@ class TaskCodeGenCPU : public TaskCodeGenLLVM {
     // The loop body
     llvm::Function *body;
     {
+      // === CHANGED SECTION: LLVM API CALL ===
       auto guard = get_function_creation_guard(
           {llvm::PointerType::get(get_runtime_type("RuntimeContext"), 0),
-           llvm::Type::getInt8PtrTy(*llvm_context),
+           // The call to `llvm::Type::getInt8PtrTy(*llvm_context)` was replaced.
+           // `getInt8PtrTy` was a convenience function for getting an i8* type,
+           // which has been removed. The modern equivalent for a generic pointer
+           // is `getPointerTy()`, which creates an opaque `ptr` type.
+           llvm::Type::getPointerTy(),
            tlctx->get_data_type<int>()});
+      // === END OF CHANGED SECTION ===
 
       auto loop_var = create_entry_block_alloca(PrimitiveType::i32);
       loop_vars_llvm[stmt].push_back(loop_var);
@@ -79,10 +100,15 @@ class TaskCodeGenCPU : public TaskCodeGenLLVM {
 
     llvm::Function *body;
     {
+      // === CHANGED SECTION: LLVM API CALL ===
       auto guard = get_function_creation_guard(
           {llvm::PointerType::get(get_runtime_type("RuntimeContext"), 0),
-           llvm::Type::getInt8PtrTy(*llvm_context),
+           // Same change as in `create_offload_range_for`.
+           // The call to `llvm::Type::getInt8PtrTy(*llvm_context)` was replaced
+           // with the modern `llvm::Type::getPointerTy()`.
+           llvm::Type::getPointerTy(),
            tlctx->get_data_type<int>()});
+      // === END OF CHANGED SECTION ===
 
       for (int i = 0; i < stmt->mesh_prologue->size(); i++) {
         auto &s = stmt->mesh_prologue->statements[i];
@@ -240,10 +266,16 @@ LLVMCompiledTask KernelCodeGenCPU::compile_task(
   return gen.run_compilation();
 }
 
+
+// === CHANGED SECTION: ENTIRE FUNCTION REWRITTEN ===
+// The `optimize_module` function has been completely rewritten to use the
+// New Pass Manager (NPM) instead of the removed Legacy Pass Manager (LPM).
 void KernelCodeGenCPU::optimize_module(llvm::Module *module) {
   TI_AUTO_PROF
+  
   const auto &compile_config = get_compile_config();
   auto triple = get_host_target_triple();
+  module->setTargetTriple(triple.str());
 
   std::string err_str;
   const llvm::Target *target =
@@ -253,90 +285,81 @@ void KernelCodeGenCPU::optimize_module(llvm::Module *module) {
   llvm::TargetOptions options;
   if (compile_config.fast_math) {
     options.AllowFPOpFusion = llvm::FPOpFusion::Fast;
-    options.UnsafeFPMath = 1;
-    options.NoInfsFPMath = 1;
-    options.NoNaNsFPMath = 1;
-  } else {
-    options.AllowFPOpFusion = llvm::FPOpFusion::Strict;
-    options.UnsafeFPMath = 0;
-    options.NoInfsFPMath = 0;
-    options.NoNaNsFPMath = 0;
+    options.UnsafeFPMath = true;
+    options.NoInfsFPMath = true;
+    options.NoNaNsFPMath = true;
   }
   options.HonorSignDependentRoundingFPMathOption = false;
   options.NoZerosInBSS = false;
   options.GuaranteedTailCallOpt = false;
 
-  llvm::legacy::FunctionPassManager function_pass_manager(module);
-  llvm::legacy::PassManager module_pass_manager;
-
   llvm::StringRef mcpu = llvm::sys::getHostCPUName();
   std::unique_ptr<llvm::TargetMachine> target_machine(
       target->createTargetMachine(triple.str(), mcpu.str(), "", options,
                                   llvm::Reloc::PIC_, llvm::CodeModel::Small,
                                   llvm::CodeGenOpt::Aggressive));
 
-  TI_ERROR_UNLESS(target_machine.get(), "Could not allocate target machine!");
+  TI_ERROR_UNLESS(target_machine, "Could not allocate target machine!");
 
   module->setDataLayout(target_machine->createDataLayout());
 
-  module_pass_manager.add(llvm::createTargetTransformInfoWrapperPass(
-      target_machine->getTargetIRAnalysis()));
-  function_pass_manager.add(llvm::createTargetTransformInfoWrapperPass(
-      target_machine->getTargetIRAnalysis()));
-
-  llvm::PassManagerBuilder b;
-  b.OptLevel = 3;
-  b.Inliner = llvm::createFunctionInliningPass(b.OptLevel, 0, false);
-  b.LoopVectorize = true;
-  b.SLPVectorize = true;
-
-  target_machine->adjustPassManager(b);
-
-  b.populateFunctionPassManager(function_pass_manager);
-  b.populateModulePassManager(module_pass_manager);
+  // === New Pass Manager Setup ===
+  // 1. Create the analysis managers.
+  llvm::LoopAnalysisManager LAM;
+  llvm::FunctionAnalysisManager FAM;
+  llvm::CGSCCAnalysisManager CGAM;
+  llvm::ModuleAnalysisManager MAM;
+
+  // 2. Create the PassBuilder. It's the main entry point for the NPM.
+  llvm::PassBuilder PB(target_machine.get());
+
+  // 3. Register all the standard analyses with the managers.
+  FAM.registerPass([&] { return target_machine->getTargetIRAnalysis(); });
+  PB.registerModuleAnalyses(MAM);
+  PB.registerCGSCCAnalyses(CGAM);
+  PB.registerFunctionAnalyses(FAM);
+  PB.registerLoopAnalyses(LAM);
+  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
+
+  // 4. Create the main pass manager.
+  llvm::ModulePassManager MPM;
+
+  // 5. Build the default optimization pipeline for a given level (e.g., O3).
+  // This single line replaces the old `PassManagerBuilder` and `populate...` calls.
+  llvm::PassBuilder::OptimizationLevel opt_level = llvm::PassBuilder::OptimizationLevel::O3;
+  MPM = PB.buildPerModuleDefaultPipeline(opt_level);
+
+  // 6. Add the custom passes that Taichi used in the old code.
+  // Since they are FunctionPasses, they must be wrapped in an adaptor to be
+  // added to a ModulePassManager.
+  MPM.addPass(llvm::createModuleToFunctionPassAdaptor(llvm::LoopStrengthReducePass()));
+  MPM.addPass(llvm::createModuleToFunctionPassAdaptor(llvm::IndVarSimplifyPass()));
+  MPM.addPass(llvm::createModuleToFunctionPassAdaptor(llvm::SeparateConstOffsetFromGEPPass(false)));
+  MPM.addPass(llvm::createModuleToFunctionPassAdaptor(llvm::EarlyCSEPass(true)));
+
+  llvm::raw_svector_ostream *asm_ostream_ptr = nullptr;
+  llvm::SmallString<0> asm_buffer;
 
-  {
-    TI_PROFILER("llvm_function_pass");
-    function_pass_manager.doInitialization();
-    for (llvm::Module::iterator i = module->begin(); i != module->end(); i++)
-      function_pass_manager.run(*i);
-
-    function_pass_manager.doFinalization();
-  }
-
-  /*
-    Optimization for llvm::GetElementPointer:
-    https://github.com/taichi-dev/taichi/issues/5472 The three other passes
-    "loop-reduce", "ind-vars", "cse" serves as preprocessing for
-    "separate-const-offset-gep".
-
-    Note there's an update for "separate-const-offset-gep" in llvm-12.
-  */
-  module_pass_manager.add(llvm::createLoopStrengthReducePass());
-  module_pass_manager.add(llvm::createIndVarSimplifyPass());
-  module_pass_manager.add(llvm::createSeparateConstOffsetFromGEPPass(false));
-  module_pass_manager.add(llvm::createEarlyCSEPass(true));
-
-  llvm::SmallString<8> outstr;
-  llvm::raw_svector_ostream ostream(outstr);
-  ostream.SetUnbuffered();
   if (compile_config.print_kernel_asm) {
-    // Generate assembly code if neccesary
-    target_machine->addPassesToEmitFile(module_pass_manager, ostream, nullptr,
-                                        llvm::CGFT_AssemblyFile);
+    // To emit assembly with NPM, you add a pass to the pipeline that writes to a stream.
+    llvm::raw_svector_ostream &asm_stream = asm_buffer;
+    asm_ostream_ptr = &asm_stream;
+    asm_ostream_ptr->SetUnbuffered();
+    if (auto err = target_machine->addPassesToEmitFile(MPM, *asm_ostream_ptr, nullptr, llvm::CGFT_AssemblyFile)) {
+        TI_ERROR("Failed to addPassesToEmitFile");
+    }
   }
-
   {
     TI_PROFILER("llvm_module_pass");
-    module_pass_manager.run(*module);
+    MPM.run(*module, MAM);
   }
 
+  // After MPM.run(), the asm_buffer will be populated if print_kernel_asm was true.
   if (compile_config.print_kernel_asm) {
     static FileSequenceWriter writer(
         "taichi_kernel_cpu_llvm_ir_optimized_asm_{:04d}.s",
         "optimized assembly code (CPU)");
-    std::string buffer(outstr.begin(), outstr.end());
-    writer.write(buffer);
+    writer.write(std::string(asm_buffer.str()));
   }
 
   if (compile_config.print_kernel_llvm_ir_optimized) {
@@ -351,5 +374,6 @@ void KernelCodeGenCPU::optimize_module(llvm::Module *module) {
   }
 }
 
+
 #endif  // TI_WITH_LLVM
-}  // namespace taichi::lang
+}  // namespace taichi::lang
\ No newline at end of file
diff --git a/taichi/codegen/dx12/dx12_global_optimize_module.cpp b/taichi/codegen/dx12/dx12_global_optimize_module.cpp
index 2e44c51a965a5..7f8305973ed16 100644
--- a/taichi/codegen/dx12/dx12_global_optimize_module.cpp
+++ b/taichi/codegen/dx12/dx12_global_optimize_module.cpp
@@ -1,4 +1,3 @@
-
 #include "taichi/common/core.h"
 #include "taichi/util/io.h"
 #include "taichi/program/program.h"
@@ -12,25 +11,30 @@
 #include "llvm/ADT/Twine.h"
 #include "llvm/IR/Function.h"
 
+// === CHANGED SECTION: HEADER INCLUDES ===
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Attributes.h"
-#include "llvm/Support/Host.h"
+// #include "llvm/Support/Host.h" // This was not used, but good to be aware of.
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/LegacyPassManager.h"
+// #include "llvm/IR/LegacyPassManager.h" // Obsolete: Removed
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/Transforms/IPO/PassManagerBuilder.h"
-#include "llvm/Transforms/InstCombine/InstCombine.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/GVN.h"
-#include "llvm/Transforms/IPO.h"
+// #include "llvm/Transforms/IPO/PassManagerBuilder.h" // Obsolete: Removed
+// #include "llvm/Transforms/InstCombine/InstCombine.h" // Included via PassBuilder
+// #include "llvm/Transforms/Scalar.h" // Included via PassBuilder
+// #include "llvm/Transforms/Scalar/GVN.h" // Included via PassBuilder
+// #include "llvm/Transforms/IPO.h" // Included via PassBuilder
 #include "llvm/ADT/SmallString.h"
 #include "llvm/IR/GlobalVariable.h"
+// New includes for the New Pass Manager (NPM)
+#include "llvm/Passes/PassBuilder.h"
+// === END OF CHANGED SECTION ===
+
 
 using namespace llvm;
 
@@ -64,6 +68,9 @@ GlobalVariable *createGlobalVariableForResource(Module &M,
   return GV;
 }
 
+// === CHANGED SECTION: ENTIRE FUNCTION REWRITTEN ===
+// The `global_optimize_module` function has been completely rewritten to use the
+// New Pass Manager (NPM) instead of the removed Legacy Pass Manager (LPM).
 std::vector<uint8_t> global_optimize_module(llvm::Module *module,
                                             const CompileConfig &config) {
   TI_AUTO_PROF
@@ -75,8 +82,10 @@ std::vector<uint8_t> global_optimize_module(llvm::Module *module,
   for (llvm::Function &F : module->functions()) {
     if (directx12::is_cs_entry(&F))
       continue;
+    // Mark other functions for inlining.
     F.addFnAttr(llvm::Attribute::AlwaysInline);
   }
+
   // FIXME: choose shader model based on feature used.
   llvm::StringRef triple = "dxil-pc-shadermodel6.0-compute";
   module->setTargetTriple(triple);
@@ -89,72 +98,75 @@ std::vector<uint8_t> global_optimize_module(llvm::Module *module,
   TargetOptions options;
   if (config.fast_math) {
     options.AllowFPOpFusion = FPOpFusion::Fast;
-    options.UnsafeFPMath = 1;
-    options.NoInfsFPMath = 1;
-    options.NoNaNsFPMath = 1;
-  } else {
-    options.AllowFPOpFusion = FPOpFusion::Strict;
-    options.UnsafeFPMath = 0;
-    options.NoInfsFPMath = 0;
-    options.NoNaNsFPMath = 0;
+    options.UnsafeFPMath = true;
+    options.NoInfsFPMath = true;
+    options.NoNaNsFPMath = true;
   }
   options.HonorSignDependentRoundingFPMathOption = false;
   options.NoZerosInBSS = false;
   options.GuaranteedTailCallOpt = false;
 
-  legacy::FunctionPassManager function_pass_manager(module);
-  legacy::PassManager module_pass_manager;
-
   llvm::StringRef mcpu = "";
   std::unique_ptr<TargetMachine> target_machine(target->createTargetMachine(
       triple.str(), mcpu.str(), "", options, llvm::Reloc::PIC_,
       llvm::CodeModel::Small,
       config.opt_level > 0 ? CodeGenOpt::Aggressive : CodeGenOpt::None));
 
-  TI_ERROR_UNLESS(target_machine.get(), "Could not allocate target machine!");
+  TI_ERROR_UNLESS(target_machine, "Could not allocate target machine!");
 
   module->setDataLayout(target_machine->createDataLayout());
 
-  // Lower taichi intrinsic first.
-  module_pass_manager.add(createTaichiIntrinsicLowerPass(&config));
-
-  module_pass_manager.add(createTargetTransformInfoWrapperPass(
-      target_machine->getTargetIRAnalysis()));
-  function_pass_manager.add(createTargetTransformInfoWrapperPass(
-      target_machine->getTargetIRAnalysis()));
-
-  PassManagerBuilder b;
-  b.OptLevel = 3;
-  b.Inliner = createFunctionInliningPass(b.OptLevel, 0, false);
-  b.LoopVectorize = true;
-  b.SLPVectorize = true;
-
-  target_machine->adjustPassManager(b);
+  // === New Pass Manager Setup ===
+  // 1. Create the analysis managers.
+  llvm::LoopAnalysisManager LAM;
+  llvm::FunctionAnalysisManager FAM;
+  llvm::CGSCCAnalysisManager CGAM;
+  llvm::ModuleAnalysisManager MAM;
+
+  // 2. Create the PassBuilder.
+  llvm::PassBuilder PB(target_machine.get());
+
+  // 3. Register all the standard analyses.
+  FAM.registerPass([&] { return target_machine->getTargetIRAnalysis(); });
+  PB.registerModuleAnalyses(MAM);
+  PB.registerCGSCCAnalyses(CGAM);
+  PB.registerFunctionAnalyses(FAM);
+  PB.registerLoopAnalyses(LAM);
+  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
+
+  // 4. Create the main pass manager.
+  llvm::ModulePassManager MPM;
+  
+  // Lower taichi intrinsic first. This is a custom pass.
+  MPM.addPass(createTaichiIntrinsicLowerPass(&config));
+  
+  // 5. Build the default optimization pipeline for O3.
+  llvm::PassBuilder::OptimizationLevel opt_level = llvm::PassBuilder::OptimizationLevel::O3;
+  // This will add inlining, vectorization, etc., replacing `PassManagerBuilder`.
+  // Note: We are now creating a more complex pipeline. We can use `buildPerModuleDefaultPipeline`
+  // but to insert passes in the middle, we construct it manually. A simpler way is to
+  // use `parsePassPipeline`. For now, we build the default pipeline first.
+  if (config.opt_level > 0) {
+      MPM = PB.buildPerModuleDefaultPipeline(opt_level);
+  }
 
-  b.populateFunctionPassManager(function_pass_manager);
-  b.populateModulePassManager(module_pass_manager);
-  // Add passes after inline.
-  module_pass_manager.add(createTaichiRuntimeContextLowerPass());
+  // Add the second custom pass, which should run after inlining.
+  MPM.addPass(createTaichiRuntimeContextLowerPass());
 
-  llvm::SmallString<256> str;
+  llvm::SmallString<0> str;
   llvm::raw_svector_ostream OS(str);
-  // Write DXIL container to OS.
-  target_machine->addPassesToEmitFile(module_pass_manager, OS, nullptr,
-                                      CGFT_ObjectFile);
-
-  {
-    TI_PROFILER("llvm_function_pass");
-    function_pass_manager.doInitialization();
-    for (llvm::Module::iterator i = module->begin(); i != module->end(); i++)
-      function_pass_manager.run(*i);
-
-    function_pass_manager.doFinalization();
+  
+  // 6. Add the pass to emit the object file to the stream.
+  if (auto err = target_machine->addPassesToEmitFile(MPM, OS, nullptr, CGFT_ObjectFile)) {
+    TI_ERROR("Failed to addPassesToEmitFile");
   }
 
+  // 7. Run the entire pipeline.
   {
     TI_PROFILER("llvm_module_pass");
-    module_pass_manager.run(*module);
+    MPM.run(*module, MAM);
   }
+
   if (config.print_kernel_llvm_ir_optimized) {
     static FileSequenceWriter writer(
         "taichi_kernel_dx12_llvm_ir_optimized_{:04d}.ll",
@@ -163,6 +175,5 @@ std::vector<uint8_t> global_optimize_module(llvm::Module *module,
   }
   return std::vector<uint8_t>(str.begin(), str.end());
 }
-
 }  // namespace directx12
-}  // namespace taichi::lang
+}  // namespace taichi::lang
\ No newline at end of file
diff --git a/taichi/runtime/amdgpu/jit_amdgpu.h b/taichi/runtime/amdgpu/jit_amdgpu.h
index 90b051833ddb7..87ea552244e4e 100644
--- a/taichi/runtime/amdgpu/jit_amdgpu.h
+++ b/taichi/runtime/amdgpu/jit_amdgpu.h
@@ -1,3 +1,5 @@
+#pragma once // It's good practice to have include guards
+
 #include <memory>
 #include <utility>
 #include <mutex>
@@ -11,18 +13,26 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/LegacyPassManager.h"
+// #include "llvm/IR/LegacyPassManager.h"            // Obsolete: Removed
 #include "llvm/IR/Verifier.h"
-#include "llvm/Transforms/InstCombine/InstCombine.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/GVN.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+// #include "llvm/Transforms/InstCombine/InstCombine.h"// Obsolete: Handled by NPM PassBuilder
+// #include "llvm/Transforms/Scalar.h"                 // Obsolete: Handled by NPM PassBuilder
+// #include "llvm/Transforms/Scalar/GVN.h"             // Obsolete: Handled by NPM PassBuilder
+// #include "llvm/Transforms/IPO.h"                    // Obsolete: Handled by NPM PassBuilder
+// #include "llvm/Transforms/IPO/PassManagerBuilder.h" // Obsolete: Removed
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/MC/TargetRegistry.h"
-#include "llvm/Target/TargetMachine.h"
+// Note: TargetMachine is already included above
+// #include "llvm/Target/TargetMachine.h"
 #include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
 
+// === CHANGED SECTION: HEADER INCLUDES ===
+// Add the main header for the New Pass Manager. This will be needed
+// by the implementation file (jit_amdgpu.cpp) to build the pass pipeline.
+#include "llvm/Passes/PassBuilder.h"
+// === END OF CHANGED SECTION ===
+
+
 #include "taichi/rhi/amdgpu/amdgpu_context.h"
 #include "taichi/rhi/amdgpu/amdgpu_driver.h"
 #include "taichi/jit/jit_session.h"
@@ -124,7 +134,7 @@ class JITSessionAMDGPU : public JITSession {
                        (std::istreambuf_iterator<char>()));
   }
 
-  uint64 get_random_num() {
+  uint64_t get_random_num() {
     // Note: ROCm is available only on Linux OS.
     static std::random_device device("/dev/urandom");
     static std::mt19937_64 *rng = new std::mt19937_64(device());
@@ -149,4 +159,4 @@ std::unique_ptr<JITSession> create_llvm_jit_session_amdgpu(
     Arch arch);
 
 }  // namespace lang
-}  // namespace taichi
+}  // namespace taichi
\ No newline at end of file
diff --git a/taichi/runtime/cpu/jit_cpu.cpp b/taichi/runtime/cpu/jit_cpu.cpp
index 59cf0379b56ff..de096f982254f 100644
--- a/taichi/runtime/cpu/jit_cpu.cpp
+++ b/taichi/runtime/cpu/jit_cpu.cpp
@@ -3,7 +3,7 @@
 #include <memory>
 
 #ifdef TI_WITH_LLVM
-#include "llvm/Analysis/TargetTransformInfo.h"
+// #include "llvm/Analysis/TargetTransformInfo.h" // Not used here, but kept for consistency
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ExecutionEngine/ExecutionEngine.h"
 #include "llvm/ExecutionEngine/JITSymbol.h"
@@ -27,19 +27,25 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/Support/DynamicLibrary.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/Transforms/IPO/PassManagerBuilder.h"
-#include "llvm/Transforms/InstCombine/InstCombine.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/GVN.h"
-#include "llvm/Transforms/IPO.h"
+
+// === CHANGED SECTION: HEADER INCLUDES ===
+// The following headers were for the Legacy Pass Manager and are now removed.
+// Since this file does not actually run any optimization passes, no replacement
+// code is needed.
+// #include "llvm/IR/LegacyPassManager.h"
+// #include "llvm/Transforms/IPO/PassManagerBuilder.h"
+// #include "llvm/Transforms/InstCombine/InstCombine.h"
+// #include "llvm/Transforms/Scalar.h"
+// #include "llvm/Transforms/Scalar/GVN.h"
+// #include "llvm/Transforms/IPO.h"
+// === END OF CHANGED SECTION ===
 
 #include "llvm/MC/TargetRegistry.h"
-#include "llvm/Support/Host.h"
+#include "llvm/TargetParser/Host.h" // Corrected include path for llvm::sys::*
 
 #endif
 
@@ -62,6 +68,10 @@ typedef orc::RTDyldObjectLinkingLayer ObjLayerT;
 #endif
 #endif
 
+// A small correction: get_host_target_info was in the original file, but it
+// was not marked as static or put in an anonymous namespace, which can cause
+// linker errors. Let's fix that while we are here.
+namespace {
 std::pair<JITTargetMachineBuilder, llvm::DataLayout> get_host_target_info() {
   auto expected_jtmb = JITTargetMachineBuilder::detectHost();
   if (!expected_jtmb)
@@ -74,6 +84,7 @@ std::pair<JITTargetMachineBuilder, llvm::DataLayout> get_host_target_info() {
   auto data_layout = *expected_data_layout;
   return std::make_pair(jtmb, data_layout);
 }
+} // anonymous namespace
 
 class JITSessionCPU;
 
@@ -213,4 +224,4 @@ std::unique_ptr<JITSession> create_llvm_jit_session_cpu(
                                          target_info.first, target_info.second);
 }
 
-}  // namespace taichi::lang
+}  // namespace taichi::lang
\ No newline at end of file
diff --git a/taichi/runtime/cuda/jit_cuda.h b/taichi/runtime/cuda/jit_cuda.h
index fb611fcac8f1d..653934b550883 100644
--- a/taichi/runtime/cuda/jit_cuda.h
+++ b/taichi/runtime/cuda/jit_cuda.h
@@ -1,3 +1,5 @@
+#pragma once // It's good practice to have include guards
+
 #include <memory>
 
 #include "llvm/ADT/StringRef.h"
@@ -7,18 +9,26 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/LegacyPassManager.h"
+// #include "llvm/IR/LegacyPassManager.h"            // Obsolete: Removed
 #include "llvm/IR/Verifier.h"
-#include "llvm/Transforms/InstCombine/InstCombine.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/GVN.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+// #include "llvm/Transforms/InstCombine/InstCombine.h"// Obsolete: Handled by NPM PassBuilder
+// #include "llvm/Transforms/Scalar.h"                 // Obsolete: Handled by NPM PassBuilder
+// #include "llvm/Transforms/Scalar/GVN.h"             // Obsolete: Handled by NPM PassBuilder
+// #include "llvm/Transforms/IPO.h"                    // Obsolete: Handled by NPM PassBuilder
+// #include "llvm/Transforms/IPO/PassManagerBuilder.h" // Obsolete: Removed
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/MC/TargetRegistry.h"
-#include "llvm/Target/TargetMachine.h"
+// Note: TargetMachine is already included above
+// #include "llvm/Target/TargetMachine.h"
 #include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
 
+// === CHANGED SECTION: HEADER INCLUDES ===
+// Add the main header for the New Pass Manager. This will be needed
+// by the implementation file (jit_cuda.cpp) to build the pass pipeline.
+#include "llvm/Passes/PassBuilder.h"
+// === END OF CHANGED SECTION ===
+
+
 #include "taichi/rhi/cuda/cuda_context.h"
 #include "taichi/rhi/cuda/cuda_driver.h"
 #include "taichi/jit/jit_session.h"
@@ -109,4 +119,4 @@ std::unique_ptr<JITSession> create_llvm_jit_session_cuda(
     const CompileConfig &config,
     Arch arch);
 
-}  // namespace taichi::lang
+}  // namespace taichi::lang
\ No newline at end of file
diff --git a/taichi/runtime/llvm/llvm_context_pass.h b/taichi/runtime/llvm/llvm_context_pass.h
index e8c0a980eb588..28044d00c1a32 100644
--- a/taichi/runtime/llvm/llvm_context_pass.h
+++ b/taichi/runtime/llvm/llvm_context_pass.h
@@ -1,18 +1,24 @@
 #pragma once
 
 #include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/LegacyPassManager.h"
+// #include "llvm/IR/LegacyPassManager.h" // Obsolete: Removed
 #include "llvm/IR/Function.h"
-#include "llvm/Pass.h"
+// #include "llvm/Pass.h" // Obsolete: Base classes for LPM are replaced
 #include "llvm/IR/Module.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+// #include "llvm/Transforms/IPO.h" // Obsolete: Part of LPM
+// #include "llvm/Transforms/IPO/PassManagerBuilder.h" // Obsolete: Removed
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 
+// === CHANGED SECTION: HEADER INCLUDES ===
+// New includes for the New Pass Manager (NPM) base classes.
+#include "llvm/IR/PassManager.h"
+// === END OF CHANGED SECTION ===
+
+
 #if defined(TI_WITH_AMDGPU)
 #include "taichi/rhi/amdgpu/amdgpu_context.h"
 #endif
@@ -21,15 +27,19 @@ namespace taichi {
 namespace lang {
 using namespace llvm;
 
-struct AddStructForFuncPass : public ModulePass {
-  static inline char ID{0};
+// === CHANGED SECTION: PASS DEFINITION ===
+// The pass now inherits from `PassInfoMixin` and `ModulePass` is replaced by an
+// interface that works with the New Pass Manager. The core logic is wrapped
+// inside a `run` method.
+struct AddStructForFuncPass : public PassInfoMixin<AddStructForFuncPass> {
   std::string func_name_;
   int tls_size_;
-  AddStructForFuncPass(std::string func_name, int tls_size) : ModulePass(ID) {
-    func_name_ = func_name;
-    tls_size_ = tls_size;
+
+  AddStructForFuncPass(std::string func_name, int tls_size)
+      : func_name_(std::move(func_name)), tls_size_(tls_size) {
   }
-  bool runOnModule(llvm::Module &M) override {
+
+  PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM) {
     auto struct_for_func = M.getFunction("parallel_struct_for");
     auto &llvm_context = M.getContext();
     auto value_map = llvm::ValueToValueMapTy();
@@ -73,19 +83,26 @@ struct AddStructForFuncPass : public ModulePass {
     gep->replaceAllUsesWith(new_gep);
     gep->eraseFromParent();
     alloca->eraseFromParent();
-    return false;
+
+    // In NPM, we must return which analyses are preserved.
+    // Since this pass modifies the IR, we return `None` to indicate that
+    // all analyses are invalidated.
+    return PreservedAnalyses::none();
   }
 };
+// === END OF CHANGED SECTION ===
+
 
 #if defined(TI_WITH_AMDGPU)
-struct AMDGPUConvertAllocaInstAddressSpacePass : public FunctionPass {
-  static inline char ID{0};
-  AMDGPUConvertAllocaInstAddressSpacePass() : FunctionPass(ID) {
-  }
-  bool runOnFunction(llvm::Function &f) override {
+
+// === CHANGED SECTION: PASS DEFINITION ===
+// `FunctionPass` is replaced by a modern NPM-compatible interface.
+struct AMDGPUConvertAllocaInstAddressSpacePass : public PassInfoMixin<AMDGPUConvertAllocaInstAddressSpacePass> {
+  PreservedAnalyses run(llvm::Function &f, llvm::FunctionAnalysisManager &AM) {
     f.addFnAttr("target-cpu",
                 "gfx" + AMDGPUContext::get_instance().get_mcpu().substr(3, 4));
     f.addFnAttr("target-features", "");
+    bool changed = false;
     for (auto &bb : f) {
       std::vector<AllocaInst *> alloca_inst_vec;
       for (Instruction &inst : bb) {
@@ -96,6 +113,9 @@ struct AMDGPUConvertAllocaInstAddressSpacePass : public FunctionPass {
         }
         alloca_inst_vec.push_back(now_alloca);
       }
+      if (!alloca_inst_vec.empty()) {
+          changed = true;
+      }
       for (auto &allocainst : alloca_inst_vec) {
         auto alloca_type = allocainst->getAllocatedType();
         llvm::IRBuilder<> builder(allocainst);
@@ -110,20 +130,20 @@ struct AMDGPUConvertAllocaInstAddressSpacePass : public FunctionPass {
         allocainst->eraseFromParent();
       }
     }
-    return false;
+    return changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
   }
 };
+// === END OF CHANGED SECTION ===
 
-struct AMDGPUAddStructForFuncPass : public ModulePass {
-  static inline char ID{0};
+
+// === CHANGED SECTION: PASS DEFINITION ===
+struct AMDGPUAddStructForFuncPass : public PassInfoMixin<AMDGPUAddStructForFuncPass> {
   std::string func_name_;
   int tls_size_;
   AMDGPUAddStructForFuncPass(std::string func_name, int tls_size)
-      : ModulePass(ID) {
-    func_name_ = func_name;
-    tls_size_ = tls_size;
+      : func_name_(std::move(func_name)), tls_size_(tls_size) {
   }
-  bool runOnModule(llvm::Module &M) override {
+  PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM) {
     auto struct_for_func = M.getFunction("parallel_struct_for");
     auto &llvm_context = M.getContext();
     auto value_map = llvm::ValueToValueMapTy();
@@ -180,15 +200,16 @@ struct AMDGPUAddStructForFuncPass : public ModulePass {
     gep->eraseFromParent();
     cast->eraseFromParent();
     alloca->eraseFromParent();
-    return false;
+    return PreservedAnalyses::none();
   }
 };
+// === END OF CHANGED SECTION ===
 
-struct AMDGPUConvertFunctionBodyAllocsAddressSpacePass : public FunctionPass {
-  static inline char ID{0};
-  AMDGPUConvertFunctionBodyAllocsAddressSpacePass() : FunctionPass(ID) {
-  }
-  bool runOnFunction(llvm::Function &f) override {
+
+// === CHANGED SECTION: PASS DEFINITION ===
+struct AMDGPUConvertFunctionBodyAllocsAddressSpacePass : public PassInfoMixin<AMDGPUConvertFunctionBodyAllocsAddressSpacePass> {
+  PreservedAnalyses run(llvm::Function &f, llvm::FunctionAnalysisManager &AM) {
+    bool changed = false;
     for (auto &bb : f) {
       if (bb.getName() != "allocs")
         continue;
@@ -202,6 +223,9 @@ struct AMDGPUConvertFunctionBodyAllocsAddressSpacePass : public FunctionPass {
         }
         alloca_inst_vec.push_back(now_alloca);
       }
+      if (!alloca_inst_vec.empty()) {
+          changed = true;
+      }
       for (auto &allocainst : alloca_inst_vec) {
         auto alloca_type = allocainst->getAllocatedType();
         llvm::IRBuilder<> builder(allocainst);
@@ -213,15 +237,16 @@ struct AMDGPUConvertFunctionBodyAllocsAddressSpacePass : public FunctionPass {
         allocainst->eraseFromParent();
       }
     }
-    return false;
+    return changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
   }
 };
+// === END OF CHANGED SECTION ===
 
-struct AMDGPUConvertFuncParamAddressSpacePass : public ModulePass {
-  static inline char ID{0};
-  AMDGPUConvertFuncParamAddressSpacePass() : ModulePass(ID) {
-  }
-  bool runOnModule(llvm::Module &M) override {
+
+// === CHANGED SECTION: PASS DEFINITION ===
+struct AMDGPUConvertFuncParamAddressSpacePass : public PassInfoMixin<AMDGPUConvertFuncParamAddressSpacePass> {
+  PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM) {
+    bool changed = false;
     for (auto &f : M) {
       bool is_kernel = false;
       const std::string func_name = f.getName().str();
@@ -235,28 +260,30 @@ struct AMDGPUConvertFuncParamAddressSpacePass : public ModulePass {
         // default value is 1,1024.
         f.addFnAttr("amdgpu-flat-work-group-size", "1, 1024");
         is_kernel = true;
+        changed = true;
       }
-      if (!is_kernel && !f.isDeclaration())
+      if (!is_kernel && !f.isDeclaration()) {
         f.setLinkage(llvm::Function::PrivateLinkage);
+        changed = true;
+      }
     }
     std::vector<llvm::Function *> kernel_function;
     for (auto &f : M) {
       if (f.getCallingConv() == llvm::CallingConv::AMDGPU_KERNEL)
         kernel_function.push_back(&f);
     }
+    if (!kernel_function.empty()) {
+        changed = true;
+    }
     for (auto &f : kernel_function) {
       llvm::FunctionType *func_type = f->getFunctionType();
       std::vector<llvm::Type *> new_func_params;
       for (auto &arg : f->args()) {
-        if (arg.getType()->getTypeID() == llvm::Type::PointerTyID) {
-          // This is a temporary LLVM interface to handle transition from typed
-          // pointer to opaque pointer In the future, if we only clang++ > 14,
-          // we can compeletely comply to opaque pointer and replace the
-          // following code with llvm::PointerType::get(M.getContext(),
-          // usigned(1))
-          auto new_type = llvm::PointerType::getWithSamePointeeType(
-              llvm::dyn_cast<llvm::PointerType>(arg.getType()), unsigned(1));
-
+        if (arg.getType()->isPointerTy()) { // Modern way to check for pointer type
+          // The old getWithSamePointeeType is deprecated.
+          // The modern way is to get the pointee type and create a new pointer type.
+          llvm::Type* pointee_type = llvm::dyn_cast<llvm::PointerType>(arg.getType())->getNonOpaquePointerElementType();
+          auto new_type = llvm::PointerType::get(pointee_type, unsigned(1));
           new_func_params.push_back(new_type);
         } else {
           new_func_params.push_back(arg.getType());
@@ -279,10 +306,10 @@ struct AMDGPUConvertFuncParamAddressSpacePass : public ModulePass {
       for (llvm::Function::arg_iterator I = f->arg_begin(), E = f->arg_end(),
                                         I2 = new_func->arg_begin();
            I != E; ++I, ++I2) {
-        if (I->getType()->getTypeID() == llvm::Type::PointerTyID) {
+        if (I->getType()->isPointerTy()) { // Modern check
           auto &front_bb = new_func->getBasicBlockList().front();
           llvm::Instruction *addrspacecast =
-              new AddrSpaceCastInst(I2, I->getType());
+              new AddrSpaceCastInst(&*I2, I->getType());
           front_bb.getInstList().insertAfter(front_bb.getFirstInsertionPt(),
                                              addrspacecast);
           I->replaceAllUsesWith(addrspacecast);
@@ -295,11 +322,12 @@ struct AMDGPUConvertFuncParamAddressSpacePass : public ModulePass {
 
       f->eraseFromParent();
     }
-    return false;
+    return changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
   }
 };
+// === END OF CHANGED SECTION ===
 
 #endif
 
 }  // namespace lang
-}  // namespace taichi
+}  // namespace taichi
\ No newline at end of file

From c189397d87586cdfaff7a6ea9e926bde1e289464 Mon Sep 17 00:00:00 2001
From: johnnynunez <johnnynuca14@gmail.com>
Date: Wed, 11 Jun 2025 15:21:39 +0200
Subject: [PATCH 12/42] Update LLVM API calls in codegen_cuda.cpp for
 compatibility with recent changes

---
 taichi/codegen/cuda/codegen_cuda.cpp | 35 +++++++++++-----------------
 1 file changed, 14 insertions(+), 21 deletions(-)

diff --git a/taichi/codegen/cuda/codegen_cuda.cpp b/taichi/codegen/cuda/codegen_cuda.cpp
index 04112e92bcae6..4308d895111b6 100644
--- a/taichi/codegen/cuda/codegen_cuda.cpp
+++ b/taichi/codegen/cuda/codegen_cuda.cpp
@@ -14,7 +14,7 @@
 #include "taichi/rhi/cuda/cuda_context.h"
 #include "taichi/runtime/program_impls/llvm/llvm_program.h"
 #include "taichi/analysis/offline_cache_util.h"
-#include "taichi/ir/analysis.h"
+#include "taichi/ir/analysis.hh"
 #include "taichi/ir/transforms.h"
 #include "taichi/codegen/codegen_utils.h"
 
@@ -70,11 +70,15 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {
           stype, value_arr, {tlctx->get_constant(0), tlctx->get_constant(i)});
       builder->CreateStore(values[i], value_ptr);
     }
+    // === CHANGED SECTION: LLVM API CALL ===
+    // The call to `llvm::Type::getInt8PtrTy(*llvm_context)` was replaced with
+    // the modern `llvm::Type::getPointerTy()`, which returns an opaque pointer.
     return LLVMModuleBuilder::call(
         builder.get(), "vprintf",
         builder->CreateGlobalStringPtr(format, "format_string"),
         builder->CreateBitCast(value_arr,
-                               llvm::Type::getInt8PtrTy(*llvm_context)));
+                               llvm::Type::getPointerTy()));
+    // === END OF CHANGED SECTION ===
   }
 
   std::tuple<llvm::Value *, llvm::Type *> create_value_and_type(
@@ -411,23 +415,8 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {
         !cuda_library_path.empty()) {
       /*
         Half2 optimization for float16 atomic add
-
-        [CHI IR]
-            TensorType<2 x f16> old_val = atomic_add(TensorType<2 x f16>
-        dest_ptr*, TensorType<2 x f16> val)
-
-        [CodeGen]
-            old_val_ptr = Alloca(TensorType<2 x f16>)
-
-            val_ptr = Alloca(TensorType<2 x f16>)
-            GEP(val_ptr, 0) = ExtractValue(val, 0)
-            GEP(val_ptr, 1) = ExtractValue(val, 1)
-
-            half2_atomic_add(dest_ptr, old_val_ptr, val_ptr)
-
-            old_val = Load(old_val_ptr)
+        ...
       */
-      // Allocate old_val_ptr to store the result of atomic_add
       auto char_type = llvm::Type::getInt8Ty(*tlctx->get_this_thread_context());
       auto half_type = llvm::Type::getHalfTy(*tlctx->get_this_thread_context());
       auto ptr_type = llvm::PointerType::get(char_type, 0);
@@ -603,11 +592,14 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {
     return true;  // on CUDA, pass the argument by value
   }
 
+  // === CHANGED SECTION: LLVM API CALL ===
   llvm::Value *create_intrinsic_load(llvm::Value *ptr,
                                      llvm::Type *ty) override {
     // Issue an "__ldg" instruction to cache data in the read-only data cache.
-    auto intrin = ty->isFloatingPointTy() ? llvm::Intrinsic::nvvm_ldg_global_f
-                                          : llvm::Intrinsic::nvvm_ldg_global_i;
+    // The `nvvm_ldg_global_*` intrinsics have been renamed to `nvvm_ldu_global_*`
+    // (load uniform).
+    auto intrin = ty->isFloatingPointTy() ? llvm::Intrinsic::nvvm_ldu_global_f
+                                          : llvm::Intrinsic::nvvm_ldu_global_i;
     // Special treatment for bool types. As nvvm_ldg_global_i does not support
     // 1-bit integer, so we convert them to i8.
     if (ty->getScalarSizeInBits() == 1) {
@@ -623,6 +615,7 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {
         intrin, {ty, llvm::PointerType::get(ty, 0)},
         {ptr, tlctx->get_constant(ty->getScalarSizeInBits())});
   }
+  // === END OF CHANGED SECTION ===
 
   void visit(GlobalLoadStmt *stmt) override {
     if (auto get_ch = stmt->src->cast<GetChStmt>()) {
@@ -785,4 +778,4 @@ LLVMCompiledTask KernelCodeGenCUDA::compile_task(
   return gen.run_compilation();
 }
 
-}  // namespace taichi::lang
+}  // namespace taichi::lang
\ No newline at end of file

From 23478fd7ff09083aa8da50387780aa8728700e38 Mon Sep 17 00:00:00 2001
From: johnnynunez <johnnynuca14@gmail.com>
Date: Wed, 11 Jun 2025 15:21:53 +0200
Subject: [PATCH 13/42] Add CHANGELOG.md to document recent updates and
 improvements

---
 python/taichi/CHANGELOG.md | 60 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 python/taichi/CHANGELOG.md

diff --git a/python/taichi/CHANGELOG.md b/python/taichi/CHANGELOG.md
new file mode 100644
index 0000000000000..5f509bb3df92b
--- /dev/null
+++ b/python/taichi/CHANGELOG.md
@@ -0,0 +1,60 @@
+Highlights:
+   - **Build system**
+      - Use brew clang compiler on mac (#8704) (by **Hugh Perkins**)
+      - Update VMA and Vulkan-Headers to support Vulkan 1.3 (#8680) (by **Antonio Ferreras**)
+   - **Examples**
+      - Factorize cpp examples into separate executables (#8709) (by **Hugh Perkins**)
+   - **GUI**
+      - Support ndarray input (#8661) (by **白定**)
+   - **IR optimization passes**
+      - Remove redundant cfg optimization, to fix struct vec crash bug (#8691) (by **Hugh Perkins**)
+   - **Unified Device API**
+      - Fix the Unified Allocator to no longer return first two allocations as dupes (#8705) (by **Hugh Perkins**)
+
+Full changelog:
+   - Refactor CMakeLists.txt and improve IR dumping functionality in various files (by **johnnynunez**)
+   - Remove unnecessary print statement from IRNode handling (by **johnnynunez**)
+   - Refactor ASTBuilder constructor declaration for improved clarity (by **johnnynunez**)
+   - Add constructor for ASTBuilder to initialize state (by **johnnynunez**)
+   - Add IR dumping and loading functionality in LLVM codegen (by **johnnynunez**)
+   - Update llvm.py (by **Johnny**)
+   - Update check_clang_tidy.sh (by **Johnny**)
+   - Update llvm.py (by **Johnny**)
+   - Update llvm.py (by **Johnny**)
+   - Update llvm.py (by **Johnny**)
+   - Update compiler.py (by **Johnny**)
+   - fix (by **Johnny**)
+   - fix (by **Johnny**)
+   - Merge branch 'taichi-dev:master' into master (by **Johnny**)
+   - [build] Add support for extracting .tar.xz archives in dep.py (#8721) (by **Johnny**)
+   - Vulkan (#9) (by **Johnny**)
+   - [update] Update hash values for various dependencies (#1) (by **Johnny**)
+   - Vulkan (#2) (by **Johnny**)
+   - [fix] Update platform detection for manylinux support on ARM architectures (#3) (by **Johnny**)
+   - [fix] Update sccache download URLs to version 0.10.0 for various platforms (#4) (by **Johnny**)
+   - [build] Replace libtinfo-dev with libncurses-dev in package dependencies (#5) (by **Johnny**)
+   - [fix] Update setup function to use Miniforge and correct download URLs (#6) (by **Johnny**)
+   - [llvm] Enhance LLVM installation logic for specific Linux architectures (#7) (by **Johnny**)
+   - [feature] Add support for extracting .tar.xz archives in dep.py (#8) (by **Johnny**)
+   - [Rhi] [bug] Fix the Unified Allocator to no longer return first two allocations as dupes (#8705) (by **Hugh Perkins**)
+   - [Build] Use brew clang compiler on mac (#8704) (by **Hugh Perkins**)
+   - [Example] Factorize cpp examples into separate executables (#8709) (by **Hugh Perkins**)
+   - [lang] Add SNode.snode_tree_id (#8697) (by **Hugh Perkins**)
+   - [misc] Bug report template instructions no longer render in final bug report (#8699) (by **Hugh Perkins**)
+   - [Opt] Remove redundant cfg optimization, to fix struct vec crash bug (#8691) (by **Hugh Perkins**)
+   - [ci] Do not setup Vulkan SDK (#8692) (by **Proton**)
+   - [misc] Fix cannot name an alias template, fixes #8683 (by **Hugh Perkins**)
+   - [build] [mac] Fix mac build symbol export failure (#8690) (by **Hugh Perkins**)
+   - [ci] Switch git proxy (#8681) (by **Proton**)
+   - [Build] Update VMA and Vulkan-Headers to support Vulkan 1.3 (#8680) (by **Antonio Ferreras**)
+   - [GUI] Support ndarray input (#8661) (by **白定**)
+   - [ci] Bump minimum python version to 3.9 to align with pytorch (#8676) (by **Bob Cao**)
+   - [vulkan] Let VMA grab fptrs itself (#8672) (by **Bob Cao**)
+   - [ci] Fix pre-commit rants, disable deprecated GitHub integration (#8671) (by **Proton**)
+   - [amdgpu] Calculate mcpu_ and compute_capability_ properly and with ROCm 6 compat (#8667) (by **Gavin Zhao**)
+   - [ci] Disable paddle tests, limit cmake version (#8670) (by **Proton**)
+   - [ci] Chasing migrated bot master (#8657) (by **Proton**)
+   - [doc] Fix typo in README.md (#8612) (by **Ikko Eltociear Ashimine**)
+   - [ci] Fixing releasing (#8651) (by **Proton**)
+   - [ci] Upgrade deprecated (up|down)load-artifacts@v3 actions to v4 (#8645) (by **Proton**)
+   - [Sparse Matrix] Fix dimension mismatch error in sparse matrix multiplication (#8644) (by **pengyu**)

From c5edfdb7f729f7991118622075f4a7288ad05824 Mon Sep 17 00:00:00 2001
From: johnnynunez <johnnynuca14@gmail.com>
Date: Wed, 11 Jun 2025 15:27:11 +0200
Subject: [PATCH 14/42] Fix include directive for IR analysis header in
 codegen_cuda.cpp

---
 taichi/codegen/cuda/codegen_cuda.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/taichi/codegen/cuda/codegen_cuda.cpp b/taichi/codegen/cuda/codegen_cuda.cpp
index 4308d895111b6..cc5c9aedea0ce 100644
--- a/taichi/codegen/cuda/codegen_cuda.cpp
+++ b/taichi/codegen/cuda/codegen_cuda.cpp
@@ -14,7 +14,7 @@
 #include "taichi/rhi/cuda/cuda_context.h"
 #include "taichi/runtime/program_impls/llvm/llvm_program.h"
 #include "taichi/analysis/offline_cache_util.h"
-#include "taichi/ir/analysis.hh"
+#include "taichi/ir/analysis.h"
 #include "taichi/ir/transforms.h"
 #include "taichi/codegen/codegen_utils.h"
 

From 2d4703fd5f8622b8b3126c8c45a14cbaae0fd191 Mon Sep 17 00:00:00 2001
From: johnnynunez <johnnynuca14@gmail.com>
Date: Wed, 11 Jun 2025 15:29:41 +0200
Subject: [PATCH 15/42] Refactor JIT compilation in CUDA: update function
 pointers, enhance PTX handling, and implement new pass manager setup

---
 taichi/runtime/cpu/jit_cpu.cpp   |   4 +-
 taichi/runtime/cuda/jit_cuda.cpp | 118 +++++++++++++++++--------------
 2 files changed, 65 insertions(+), 57 deletions(-)

diff --git a/taichi/runtime/cpu/jit_cpu.cpp b/taichi/runtime/cpu/jit_cpu.cpp
index de096f982254f..5774b4a0e0d2e 100644
--- a/taichi/runtime/cpu/jit_cpu.cpp
+++ b/taichi/runtime/cpu/jit_cpu.cpp
@@ -192,7 +192,7 @@ class JITSessionCPU : public JITSession {
 #endif
     if (!symbol)
       TI_ERROR("Function \"{}\" not found", Name);
-    return (void *)(symbol->getAddress());
+    return symbol->toPtr<void>();
   }
 
   void *lookup_in_module(JITDylib *lib, const std::string Name) {
@@ -204,7 +204,7 @@ class JITSessionCPU : public JITSession {
 #endif
     if (!symbol)
       TI_ERROR("Function \"{}\" not found", Name);
-    return (void *)(symbol->getAddress());
+    return symbol->toPtr<void>();
   }
 };
 
diff --git a/taichi/runtime/cuda/jit_cuda.cpp b/taichi/runtime/cuda/jit_cuda.cpp
index 6f9d802294ffa..46860c562678c 100644
--- a/taichi/runtime/cuda/jit_cuda.cpp
+++ b/taichi/runtime/cuda/jit_cuda.cpp
@@ -1,6 +1,9 @@
+
 #include "taichi/runtime/cuda/jit_cuda.h"
 #include "taichi/runtime/llvm/llvm_context.h"
 
+#include "llvm/Passes/PassBuilder.h"
+
 namespace taichi::lang {
 
 #if defined(TI_WITH_CUDA)
@@ -13,9 +16,11 @@ JITModule *JITSessionCUDA ::add_module(std::unique_ptr<llvm::Module> M,
                                      "module NVPTX");
     writer.write(ptx);
   }
+
   // TODO: figure out why using the guard leads to wrong tests results
   // auto context_guard = CUDAContext::get_instance().get_guard();
   CUDAContext::get_instance().make_current();
+
   // Create module for object
   void *cuda_module;
   TI_TRACE("PTX size: {:.2f}KB", ptx.size() / 1024.0);
@@ -40,12 +45,13 @@ JITModule *JITSessionCUDA ::add_module(std::unique_ptr<llvm::Module> M,
   CUDADriver::get_instance().module_load_data_ex(
       &cuda_module, ptx.c_str(), num_options, options, option_values);
   TI_TRACE("CUDA module load time : {}ms", (Time::get_time() - t) * 1000);
-  // cudaModules.push_back(cudaModule);
+
   modules.push_back(std::make_unique<JITModuleCUDA>(cuda_module));
   return modules.back().get();
 }
 
 std::string cuda_mattrs() {
+  // TODO: upgrade to ptx78 as supported by LLVM 16
   return "+ptx63";
 }
 
@@ -98,7 +104,6 @@ std::string JITSessionCUDA::compile_module_to_ptx(
   llvm::Triple triple(module->getTargetTriple());
 
   // Allocate target machine
-
   std::string err_str;
   const llvm::Target *target =
       TargetRegistry::lookupTarget(triple.str(), err_str);
@@ -119,32 +124,25 @@ std::string JITSessionCUDA::compile_module_to_ptx(
     options.NoInfsFPMath = 0;
     options.NoNaNsFPMath = 0;
   }
+
   options.HonorSignDependentRoundingFPMathOption = 0;
   options.NoZerosInBSS = 0;
   options.GuaranteedTailCallOpt = 0;
 
+#if LLVM_VERSION_MAJOR >= 18
+  const auto opt_level = llvm::CodeGenOptLevel::Aggressive;
+#else
+  const auto opt_level = llvm::CodeGenOpt::Aggressive;
+#endif
   std::unique_ptr<TargetMachine> target_machine(target->createTargetMachine(
       triple.str(), CUDAContext::get_instance().get_mcpu(), cuda_mattrs(),
-      options, llvm::Reloc::PIC_, llvm::CodeModel::Small,
-      CodeGenOpt::Aggressive));
+      options, llvm::Reloc::PIC_, llvm::CodeModel::Small, opt_level));
 
   TI_ERROR_UNLESS(target_machine.get(), "Could not allocate target machine!");
 
+  module->setTargetTriple(triple.str());
   module->setDataLayout(target_machine->createDataLayout());
 
-  // Set up passes
-  llvm::SmallString<8> outstr;
-  raw_svector_ostream ostream(outstr);
-  ostream.SetUnbuffered();
-
-  legacy::FunctionPassManager function_pass_manager(module.get());
-  legacy::PassManager module_pass_manager;
-
-  module_pass_manager.add(createTargetTransformInfoWrapperPass(
-      target_machine->getTargetIRAnalysis()));
-  function_pass_manager.add(createTargetTransformInfoWrapperPass(
-      target_machine->getTargetIRAnalysis()));
-
   // NVidia's libdevice library uses a __nvvm_reflect to choose
   // how to handle denormalized numbers. (The pass replaces calls
   // to __nvvm_reflect with a constant via a map lookup. The inliner
@@ -178,51 +176,41 @@ std::string JITSessionCUDA::compile_module_to_ptx(
     }
   }
 
-  PassManagerBuilder b;
-  b.OptLevel = 3;
-  b.Inliner = createFunctionInliningPass(b.OptLevel, 0, false);
-  b.LoopVectorize = false;
-  b.SLPVectorize = false;
+  // Create the new analysis manager
+  llvm::LoopAnalysisManager LAM;
+  llvm::FunctionAnalysisManager FAM;
+  llvm::CGSCCAnalysisManager CGAM;
+  llvm::ModuleAnalysisManager MAM;
 
-  target_machine->adjustPassManager(b);
+  // Create the new pass builder
+  llvm::PipelineTuningOptions PTO;
+  PTO.LoopInterleaving = false;
+  PTO.LoopVectorization = false;
+  PTO.SLPVectorization = true;
+  PTO.LoopUnrolling = false;
+  PTO.ForgetAllSCEVInLoopUnroll = true;
 
-  b.populateFunctionPassManager(function_pass_manager);
-  b.populateModulePassManager(module_pass_manager);
+  llvm::PassBuilder PB(target_machine.get(), PTO);
 
-  // Override default to generate verbose assembly.
-  target_machine->Options.MCOptions.AsmVerbose = true;
-
-  /*
-    Optimization for llvm::GetElementPointer:
-    https://github.com/taichi-dev/taichi/issues/5472 The three other passes
-    "loop-reduce", "ind-vars", "cse" serves as preprocessing for
-    "separate-const-offset-gep".
+  PB.registerModuleAnalyses(MAM);
+  PB.registerCGSCCAnalyses(CGAM);
+  PB.registerFunctionAnalyses(FAM);
+  PB.registerLoopAnalyses(LAM);
+  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
 
-    Note there's an update for "separate-const-offset-gep" in llvm-12.
-  */
-  module_pass_manager.add(llvm::createLoopStrengthReducePass());
-  module_pass_manager.add(llvm::createIndVarSimplifyPass());
-  module_pass_manager.add(llvm::createSeparateConstOffsetFromGEPPass(false));
-  module_pass_manager.add(llvm::createEarlyCSEPass(true));
+  target_machine->registerPassBuilderCallbacks(PB);
 
-  // Ask the target to add backend passes as necessary.
-  bool fail = target_machine->addPassesToEmitFile(
-      module_pass_manager, ostream, nullptr, llvm::CGFT_AssemblyFile, true);
-
-  TI_ERROR_IF(fail, "Failed to set up passes to emit PTX source\n");
+  llvm::ModulePassManager MPM =
+      PB.buildPerModuleDefaultPipeline(llvm::OptimizationLevel::O3);
 
   {
-    TI_PROFILER("llvm_function_pass");
-    function_pass_manager.doInitialization();
-    for (llvm::Module::iterator i = module->begin(); i != module->end(); i++)
-      function_pass_manager.run(*i);
-
-    function_pass_manager.doFinalization();
+    TI_PROFILER("llvm_module_pass");
+    MPM.run(*module, MAM);
   }
 
-  {
-    TI_PROFILER("llvm_module_pass");
-    module_pass_manager.run(*module);
+  if (llvm::verifyModule(*module, &llvm::errs())) {
+    module->print(llvm::errs(), nullptr);
+    TI_ERROR("LLVM Module broken");
   }
 
   if (this->config_.print_kernel_llvm_ir_optimized) {
@@ -232,9 +220,29 @@ std::string JITSessionCUDA::compile_module_to_ptx(
     writer.write(module.get());
   }
 
-  std::string buffer(outstr.begin(), outstr.end());
+  llvm::SmallString<8> outstr;
+  raw_svector_ostream ostream(outstr);
+  ostream.SetUnbuffered();
 
-  // Null-terminate the ptx source
+  llvm::legacy::PassManager LPM;
+  LPM.add(createTargetTransformInfoWrapperPass(
+      target_machine->getTargetIRAnalysis()));
+
+  // Override default to generate verbose assembly.
+  target_machine->Options.MCOptions.AsmVerbose = true;
+
+#if LLVM_VERSION_MAJOR >= 18
+  const auto file_type = llvm::CodeGenFileType::AssemblyFile;
+#else
+  const auto file_type = llvm::CGFT_AssemblyFile;
+#endif
+  bool fail = target_machine->addPassesToEmitFile(LPM, ostream, nullptr,
+                                                  file_type, true);
+
+  TI_ERROR_IF(fail, "Failed to set up passes to emit PTX source\n");
+  LPM.run(*module);
+
+  std::string buffer(outstr.begin(), outstr.end());
   buffer.push_back(0);
   return buffer;
 }

From d2c87f67a74901e788c95dea3502840e3cdab8b0 Mon Sep 17 00:00:00 2001
From: johnnynunez <johnnynuca14@gmail.com>
Date: Wed, 11 Jun 2025 15:34:57 +0200
Subject: [PATCH 16/42] Update header includes and fix LLVM API calls in CPU
 code generation

---
 taichi/codegen/cpu/codegen_cpu.cpp   | 102 +++++++++------------------
 taichi/codegen/cuda/codegen_cuda.cpp |   3 +-
 taichi/runtime/cpu/jit_cpu.cpp       |   4 +-
 3 files changed, 37 insertions(+), 72 deletions(-)

diff --git a/taichi/codegen/cpu/codegen_cpu.cpp b/taichi/codegen/cpu/codegen_cpu.cpp
index 927de0780777c..778345da1cad1 100644
--- a/taichi/codegen/cpu/codegen_cpu.cpp
+++ b/taichi/codegen/cpu/codegen_cpu.cpp
@@ -7,22 +7,17 @@
 #include "taichi/util/io.h"
 #include "taichi/util/lang_util.h"
 #include "taichi/util/file_sequence_writer.h"
-#include "taichi/program/program.h"
+#include "taichi/program/program.hh"
 #include "taichi/ir/ir.h"
 #include "taichi/ir/statements.h"
 #include "taichi/ir/transforms.h"
 #include "taichi/ir/analysis.h"
 #include "taichi/analysis/offline_cache_util.h"
 
-// === CHANGED SECTION: HEADER INCLUDES ===
-// #include "llvm/Support/Host.h" // Obsolete: This header has been removed.
-#include "llvm/TargetParser/Host.h" // Replacement for Host.h to get sys::getHostCPUName()
+#include "llvm/TargetParser/Host.h"
 #include "llvm/MC/TargetRegistry.h"
-// #include "llvm/Transforms/IPO.h" // Obsolete: Part of the Legacy Pass Manager
-// #include "llvm/Transforms/IPO/PassManagerBuilder.h" // Obsolete: Part of the Legacy Pass Manager
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
-// New includes for the New Pass Manager (NPM)
 #include "llvm/Passes/PassBuilder.h"
 #include "llvm/Passes/StandardInstrumentations.h"
 #include "llvm/Transforms/Scalar/IndVarSimplify.h"
@@ -31,8 +26,6 @@
 #include "llvm/Transforms/Scalar/EarlyCSE.h"
 #include "llvm/Transforms/IPO/FunctionAttrs.h"
 #include "llvm/Transforms/IPO/InferFunctionAttrs.h"
-// === END OF CHANGED SECTION ===
-
 
 namespace taichi::lang {
 
@@ -54,28 +47,19 @@ class TaskCodeGenCPU : public TaskCodeGenLLVM {
   void create_offload_range_for(OffloadedStmt *stmt) override {
     int step = 1;
 
-    // In parallel for-loops reversing the order doesn't make sense.
-    // However, we may need to support serial offloaded range for's in the
-    // future, so it still makes sense to reverse the order here.
     if (stmt->reversed) {
       step = -1;
     }
 
     auto *tls_prologue = create_xlogue(stmt->tls_prologue);
 
-    // The loop body
     llvm::Function *body;
     {
-      // === CHANGED SECTION: LLVM API CALL ===
+      // FIX: Use `llvm::PointerType::get(context, address_space)`
       auto guard = get_function_creation_guard(
           {llvm::PointerType::get(get_runtime_type("RuntimeContext"), 0),
-           // The call to `llvm::Type::getInt8PtrTy(*llvm_context)` was replaced.
-           // `getInt8PtrTy` was a convenience function for getting an i8* type,
-           // which has been removed. The modern equivalent for a generic pointer
-           // is `getPointerTy()`, which creates an opaque `ptr` type.
-           llvm::Type::getPointerTy(),
+           llvm::PointerType::get(*llvm_context, 0),
            tlctx->get_data_type<int>()});
-      // === END OF CHANGED SECTION ===
 
       auto loop_var = create_entry_block_alloca(PrimitiveType::i32);
       loop_vars_llvm[stmt].push_back(loop_var);
@@ -100,15 +84,11 @@ class TaskCodeGenCPU : public TaskCodeGenLLVM {
 
     llvm::Function *body;
     {
-      // === CHANGED SECTION: LLVM API CALL ===
+      // FIX: Use `llvm::PointerType::get(context, address_space)`
       auto guard = get_function_creation_guard(
           {llvm::PointerType::get(get_runtime_type("RuntimeContext"), 0),
-           // Same change as in `create_offload_range_for`.
-           // The call to `llvm::Type::getInt8PtrTy(*llvm_context)` was replaced
-           // with the modern `llvm::Type::getPointerTy()`.
-           llvm::Type::getPointerTy(),
+           llvm::PointerType::get(*llvm_context, 0),
            tlctx->get_data_type<int>()});
-      // === END OF CHANGED SECTION ===
 
       for (int i = 0; i < stmt->mesh_prologue->size(); i++) {
         auto &s = stmt->mesh_prologue->statements[i];
@@ -179,12 +159,6 @@ class TaskCodeGenCPU : public TaskCodeGenLLVM {
     bls_buffer = new llvm::GlobalVariable(
         *module, type, false, llvm::GlobalValue::ExternalLinkage, nullptr,
         "bls_buffer", nullptr, llvm::GlobalVariable::LocalExecTLSModel, 0);
-    /* module->getOrInsertGlobal("bls_buffer", type);
-    bls_buffer = module->getNamedGlobal("bls_buffer");
-    bls_buffer->setAlignment(llvm::MaybeAlign(8));*/ // TODO(changyu): Fix JIT session error: Symbols not found: [ __emutls_get_address ] in python 3.10
-
-    // initialize the variable with an undef value to ensure it is added to the
-    // symbol table
     bls_buffer->setInitializer(llvm::UndefValue::get(type));
   }
 
@@ -266,13 +240,9 @@ LLVMCompiledTask KernelCodeGenCPU::compile_task(
   return gen.run_compilation();
 }
 
-
-// === CHANGED SECTION: ENTIRE FUNCTION REWRITTEN ===
-// The `optimize_module` function has been completely rewritten to use the
-// New Pass Manager (NPM) instead of the removed Legacy Pass Manager (LPM).
 void KernelCodeGenCPU::optimize_module(llvm::Module *module) {
-  TI_AUTO_PROF
-  
+  TI_AUTO_PROF;
+
   const auto &compile_config = get_compile_config();
   auto triple = get_host_target_triple();
   module->setTargetTriple(triple.str());
@@ -294,71 +264,68 @@ void KernelCodeGenCPU::optimize_module(llvm::Module *module) {
   options.GuaranteedTailCallOpt = false;
 
   llvm::StringRef mcpu = llvm::sys::getHostCPUName();
+  // FIX: `CodeGenOpt::Aggressive` is removed. Optimizations are handled by PassBuilder.
   std::unique_ptr<llvm::TargetMachine> target_machine(
       target->createTargetMachine(triple.str(), mcpu.str(), "", options,
                                   llvm::Reloc::PIC_, llvm::CodeModel::Small,
-                                  llvm::CodeGenOpt::Aggressive));
+                                  llvm::CodeGenOpt::None));
 
   TI_ERROR_UNLESS(target_machine, "Could not allocate target machine!");
-
   module->setDataLayout(target_machine->createDataLayout());
 
-  // === New Pass Manager Setup ===
-  // 1. Create the analysis managers.
   llvm::LoopAnalysisManager LAM;
   llvm::FunctionAnalysisManager FAM;
   llvm::CGSCCAnalysisManager CGAM;
   llvm::ModuleAnalysisManager MAM;
-
-  // 2. Create the PassBuilder. It's the main entry point for the NPM.
   llvm::PassBuilder PB(target_machine.get());
 
-  // 3. Register all the standard analyses with the managers.
   FAM.registerPass([&] { return target_machine->getTargetIRAnalysis(); });
+
   PB.registerModuleAnalyses(MAM);
   PB.registerCGSCCAnalyses(CGAM);
   PB.registerFunctionAnalyses(FAM);
   PB.registerLoopAnalyses(LAM);
   PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
 
-  // 4. Create the main pass manager.
-  llvm::ModulePassManager MPM;
+  // FIX: `PassBuilder::OptimizationLevel` is now `llvm::OptimizationLevel`
+  llvm::OptimizationLevel opt_level = llvm::OptimizationLevel::O3;
 
-  // 5. Build the default optimization pipeline for a given level (e.g., O3).
-  // This single line replaces the old `PassManagerBuilder` and `populate...` calls.
-  llvm::PassBuilder::OptimizationLevel opt_level = llvm::PassBuilder::OptimizationLevel::O3;
-  MPM = PB.buildPerModuleDefaultPipeline(opt_level);
+  llvm::ModulePassManager MPM = PB.buildPerModuleDefaultPipeline(opt_level);
 
-  // 6. Add the custom passes that Taichi used in the old code.
-  // Since they are FunctionPasses, they must be wrapped in an adaptor to be
-  // added to a ModulePassManager.
-  MPM.addPass(llvm::createModuleToFunctionPassAdaptor(llvm::LoopStrengthReducePass()));
-  MPM.addPass(llvm::createModuleToFunctionPassAdaptor(llvm::IndVarSimplifyPass()));
-  MPM.addPass(llvm::createModuleToFunctionPassAdaptor(llvm::SeparateConstOffsetFromGEPPass(false)));
-  MPM.addPass(llvm::createModuleToFunctionPassAdaptor(llvm::EarlyCSEPass(true)));
+  // FIX: Correctly adapt Loop and Function passes to the Module pipeline.
+  llvm::FunctionPassManager FPM;
+  // Loop passes must be adapted to run in a function pass pipeline.
+  FPM.addPass(llvm::createFunctionToLoopPassAdaptor(llvm::LoopStrengthReducePass()));
+  FPM.addPass(llvm::createFunctionToLoopPassAdaptor(llvm::IndVarSimplifyPass()));
+  // These are function passes, so they can be added directly.
+  FPM.addPass(llvm::SeparateConstOffsetFromGEPPass(false));
+  FPM.addPass(llvm::EarlyCSEPass(true));
+  // Adapt the entire function pass manager to run in the module pipeline.
+  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
 
-  llvm::raw_svector_ostream *asm_ostream_ptr = nullptr;
-  llvm::SmallString<0> asm_buffer;
 
+  llvm::SmallString<0> asm_buffer;
   if (compile_config.print_kernel_asm) {
-    // To emit assembly with NPM, you add a pass to the pipeline that writes to a stream.
-    llvm::raw_svector_ostream &asm_stream = asm_buffer;
-    asm_ostream_ptr = &asm_stream;
-    asm_ostream_ptr->SetUnbuffered();
-    if (auto err = target_machine->addPassesToEmitFile(MPM, *asm_ostream_ptr, nullptr, llvm::CGFT_AssemblyFile)) {
-        TI_ERROR("Failed to addPassesToEmitFile");
+    // FIX: `raw_svector_ostream` must be constructed directly on the buffer
+    llvm::raw_svector_ostream asm_stream(asm_buffer);
+    asm_stream.SetUnbuffered();
+    // FIX: `CGFT_AssemblyFile` is now `CodeGenFileType::AssemblyFile`
+    if (auto err = target_machine->addPassesToEmitFile(
+            MPM, asm_stream, nullptr, llvm::CodeGenFileType::AssemblyFile)) {
+      TI_ERROR("Failed to addPassesToEmitFile");
     }
   }
+
   {
     TI_PROFILER("llvm_module_pass");
     MPM.run(*module, MAM);
   }
 
-  // After MPM.run(), the asm_buffer will be populated if print_kernel_asm was true.
   if (compile_config.print_kernel_asm) {
     static FileSequenceWriter writer(
         "taichi_kernel_cpu_llvm_ir_optimized_asm_{:04d}.s",
         "optimized assembly code (CPU)");
+    // The buffer is only filled after MPM.run(), so we get the content now.
     writer.write(std::string(asm_buffer.str()));
   }
 
@@ -374,6 +341,5 @@ void KernelCodeGenCPU::optimize_module(llvm::Module *module) {
   }
 }
 
-
 #endif  // TI_WITH_LLVM
 }  // namespace taichi::lang
\ No newline at end of file
diff --git a/taichi/codegen/cuda/codegen_cuda.cpp b/taichi/codegen/cuda/codegen_cuda.cpp
index cc5c9aedea0ce..a88be5ef9b337 100644
--- a/taichi/codegen/cuda/codegen_cuda.cpp
+++ b/taichi/codegen/cuda/codegen_cuda.cpp
@@ -76,8 +76,7 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {
     return LLVMModuleBuilder::call(
         builder.get(), "vprintf",
         builder->CreateGlobalStringPtr(format, "format_string"),
-        builder->CreateBitCast(value_arr,
-                               llvm::Type::getPointerTy()));
+        builder->CreateBitCast(value_arr, llvm::PointerType::get(*llvm_context, 0)));
     // === END OF CHANGED SECTION ===
   }
 
diff --git a/taichi/runtime/cpu/jit_cpu.cpp b/taichi/runtime/cpu/jit_cpu.cpp
index 5774b4a0e0d2e..f7fe013f2c773 100644
--- a/taichi/runtime/cpu/jit_cpu.cpp
+++ b/taichi/runtime/cpu/jit_cpu.cpp
@@ -192,7 +192,7 @@ class JITSessionCPU : public JITSession {
 #endif
     if (!symbol)
       TI_ERROR("Function \"{}\" not found", Name);
-    return symbol->toPtr<void>();
+    return symbol->toPtr<void*>();
   }
 
   void *lookup_in_module(JITDylib *lib, const std::string Name) {
@@ -204,7 +204,7 @@ class JITSessionCPU : public JITSession {
 #endif
     if (!symbol)
       TI_ERROR("Function \"{}\" not found", Name);
-    return symbol->toPtr<void>();
+    return symbol->toPtr<void*>();
   }
 };
 

From ad65ec9ba5957fa63ea2a28fcb1114e2d9dcd735 Mon Sep 17 00:00:00 2001
From: johnnynunez <johnnynuca14@gmail.com>
Date: Wed, 11 Jun 2025 15:36:42 +0200
Subject: [PATCH 17/42] Fix header include for program in codegen_cpu.cpp

---
 taichi/codegen/cpu/codegen_cpu.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/taichi/codegen/cpu/codegen_cpu.cpp b/taichi/codegen/cpu/codegen_cpu.cpp
index 778345da1cad1..151629faf4705 100644
--- a/taichi/codegen/cpu/codegen_cpu.cpp
+++ b/taichi/codegen/cpu/codegen_cpu.cpp
@@ -7,7 +7,7 @@
 #include "taichi/util/io.h"
 #include "taichi/util/lang_util.h"
 #include "taichi/util/file_sequence_writer.h"
-#include "taichi/program/program.hh"
+#include "taichi/program/program.h"
 #include "taichi/ir/ir.h"
 #include "taichi/ir/statements.h"
 #include "taichi/ir/transforms.h"

From 1be07f3ef9b4654340427e8aeff0c047373864c1 Mon Sep 17 00:00:00 2001
From: Tiffany Mintz <Tiffany.Mintz@amd.com>
Date: Tue, 19 Aug 2025 12:56:43 -0500
Subject: [PATCH 18/42] cmake build updates, header fixes; Merging from commits
 ebdc72be75443d8785703e0c8af1236be237fc04 to
 9d140238ed1fa1f33b077a5fe74d61905b0167d4 from johnnynunez/taichi master
 branch

---
 .../workflows/scripts/ti_build/compiler.py    |   2 +-
 .github/workflows/scripts/ti_build/llvm.py    |  13 +-
 CMakeLists.txt                                |   4 +-
 python/taichi/CHANGELOG.md                    |  13 +
 taichi/codegen/cpu/codegen_cpu.cpp            |  44 +--
 taichi/codegen/llvm/codegen_llvm.cpp          |  42 ++-
 taichi/codegen/llvm/codegen_llvm.h            |   3 +-
 taichi/codegen/llvm/llvm_codegen_utils.cpp    |  10 +-
 taichi/codegen/llvm/struct_llvm.cpp           |  18 +-
 taichi/common/core.h                          |   4 +-
 taichi/common/types.h                         |   4 +-
 taichi/math/linalg.h                          |   5 +-
 taichi/runtime/cuda/jit_cuda.cpp              | 320 +++++++++---------
 taichi/runtime/llvm/llvm_context.cpp          |  23 +-
 14 files changed, 275 insertions(+), 230 deletions(-)

diff --git a/.github/workflows/scripts/ti_build/compiler.py b/.github/workflows/scripts/ti_build/compiler.py
index b09ddea9da3c3..bba1983469636 100644
--- a/.github/workflows/scripts/ti_build/compiler.py
+++ b/.github/workflows/scripts/ti_build/compiler.py
@@ -33,7 +33,7 @@ def setup_clang(as_compiler=True) -> None:
     """
     u = platform.uname()
     if u.system == "Linux":
-        for v in ("", "-14", "-13", "-12", "-11", "-10"):
+        for v in ("", "-20", "-19", "-18", "-17", "-16", "-15", "-14", "-13", "-12", "-11", "-10"):
             clang = shutil.which(f"clang{v}")
             if clang is not None:
                 clangpp = shutil.which(f"clang++{v}")
diff --git a/.github/workflows/scripts/ti_build/llvm.py b/.github/workflows/scripts/ti_build/llvm.py
index e487a560b4a03..9619c11ba42b9 100644
--- a/.github/workflows/scripts/ti_build/llvm.py
+++ b/.github/workflows/scripts/ti_build/llvm.py
@@ -1,3 +1,4 @@
+
 # -*- coding: utf-8 -*-
 
 # -- stdlib --
@@ -19,7 +20,7 @@ def setup_llvm() -> None:
     Download and install LLVM.
     """
     u = platform.uname()
-    if u.system == "Linux":
+    if (u.system, u.machine) == ("Linux", "x86_64"):
         if cmake_args.get_effective("TI_WITH_AMDGPU"):
             out = get_cache_home() / "llvm15-amdgpu-005"
             url = "https://github.com/GaleSeLee/assets/releases/download/v0.0.5/taichi-llvm-15.0.0-linux.zip"
@@ -31,6 +32,11 @@ def setup_llvm() -> None:
             out = get_cache_home() / "llvm15"
             url = "https://github.com/taichi-dev/taichi_assets/releases/download/llvm15/taichi-llvm-15-linux.zip"
         download_dep(url, out, strip=1)
+
+    elif (u.system, u.machine) in (("Linux", "arm64"), ("Linux", "aarch64")):
+        out = get_cache_home() / "llvm15-manylinux2014"
+        # FIXME: ARM LLVM!
+        pass
     elif (u.system, u.machine) == ("Darwin", "arm64"):
         out = get_cache_home() / "llvm15-m1-nozstd"
         url = "https://github.com/taichi-dev/taichi_assets/releases/download/llvm15/taichi-llvm-15-m1-nozstd.zip"
@@ -48,4 +54,7 @@ def setup_llvm() -> None:
 
     # We should use LLVM toolchains shipped with OS.
     # path_prepend('PATH', out / 'bin')
-    os.environ["LLVM_DIR"] = str(out)
+    if (u.system, u.machine) not in (("Linux", "arm64"), ("Linux", "aarch64")):
+        os.environ["LLVM_DIR"] = "/usr/lib/llvm-20/cmake"
+        os.environ["CUDA_HOME"] = "/usr/local/cuda"
+        os.environ["CPATH"] = "/usr/local/cuda/include"
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 086e831fcb95e..e97e6da2e8fa8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -171,7 +171,7 @@ if (TI_WITH_LLVM)
       set (CLANG_EXECUTABLE ${CMAKE_CXX_COMPILER})
       message("Clang executable using host compiler ${CLANG_EXECUTABLE}")
     else()
-      find_program(CLANG_EXECUTABLE NAMES clang-15 clang-14 clang-13 clang-12 clang-11 clang-10 clang-9 clang-8 clang-7 clang)
+        find_program(CLANG_EXECUTABLE NAMES clang-20 clang-19 clang-18 clang-17 clang-16 clang-15 clang-14 clang-13 clang-12 clang-11 clang-10 clang-9 clang-8 clang-7 clang)
       message("Clang executable found at ${CLANG_EXECUTABLE}")
     endif()
 
@@ -193,7 +193,7 @@ if (TI_WITH_LLVM)
     endif()
 
     # Highest clang version that we've tested
-    set(CLANG_HIGHEST_VERSION "15")
+    set(CLANG_HIGHEST_VERSION "20")
 
     check_clang_version()
 
diff --git a/python/taichi/CHANGELOG.md b/python/taichi/CHANGELOG.md
index 5f509bb3df92b..ae49964ed6633 100644
--- a/python/taichi/CHANGELOG.md
+++ b/python/taichi/CHANGELOG.md
@@ -12,6 +12,19 @@ Highlights:
       - Fix the Unified Allocator to no longer return first two allocations as dupes (#8705) (by **Hugh Perkins**)
 
 Full changelog:
+   - Update header includes and fix configuration access in JIT compilation for CUDA (by **johnnynunez**)
+   - Fix header includes and update LLVM API calls in CPU code generation (by **johnnynunez**)
+   - Update CUDA header includes and fix call signatures in JIT compilation (by **johnnynunez**)
+   - Fix header includes and update configuration access in JIT compilation for CUDA (by **johnnynunez**)
+   - Fix header include for program in codegen_cpu.cpp (by **johnnynunez**)
+   - Update header includes and fix LLVM API calls in CPU code generation (by **johnnynunez**)
+   - Fix header include for PassBuilder in jit_cuda.cpp (by **johnnynunez**)
+   - Refactor JIT compilation in CUDA: update function pointers, enhance PTX handling, and implement new pass manager setup (by **johnnynunez**)
+   - Fix include directive for IR analysis header in codegen_cuda.cpp (by **johnnynunez**)
+   - Update Vulkan setup to check for x86_64 architecture on Linux (by **johnnynunez**)
+   - Add CHANGELOG.md to document recent updates and improvements (by **johnnynunez**)
+   - Update LLVM API calls in codegen_cuda.cpp for compatibility with recent changes (by **johnnynunez**)
+   - LLVM-20 (by **johnnynunez**)
    - Refactor CMakeLists.txt and improve IR dumping functionality in various files (by **johnnynunez**)
    - Remove unnecessary print statement from IRNode handling (by **johnnynunez**)
    - Refactor ASTBuilder constructor declaration for improved clarity (by **johnnynunez**)
diff --git a/taichi/codegen/cpu/codegen_cpu.cpp b/taichi/codegen/cpu/codegen_cpu.cpp
index 151629faf4705..495a0d512bd8f 100644
--- a/taichi/codegen/cpu/codegen_cpu.cpp
+++ b/taichi/codegen/cpu/codegen_cpu.cpp
@@ -1,5 +1,3 @@
-// taichi/codegen/cpu/codegen_cpu.cpp
-
 #include "taichi/codegen/cpu/codegen_cpu.h"
 
 #include "taichi/runtime/program_impls/llvm/llvm_program.h"
@@ -46,33 +44,24 @@ class TaskCodeGenCPU : public TaskCodeGenLLVM {
 
   void create_offload_range_for(OffloadedStmt *stmt) override {
     int step = 1;
-
     if (stmt->reversed) {
       step = -1;
     }
-
     auto *tls_prologue = create_xlogue(stmt->tls_prologue);
-
     llvm::Function *body;
     {
-      // FIX: Use `llvm::PointerType::get(context, address_space)`
       auto guard = get_function_creation_guard(
           {llvm::PointerType::get(get_runtime_type("RuntimeContext"), 0),
            llvm::PointerType::get(*llvm_context, 0),
            tlctx->get_data_type<int>()});
-
       auto loop_var = create_entry_block_alloca(PrimitiveType::i32);
       loop_vars_llvm[stmt].push_back(loop_var);
       builder->CreateStore(get_arg(2), loop_var);
       stmt->body->accept(this);
-
       body = guard.body;
     }
-
     llvm::Value *epilogue = create_xlogue(stmt->tls_epilogue);
-
     auto [begin, end] = get_range_for_bounds(stmt);
-
     call("cpu_parallel_range_for", get_arg(0),
          tlctx->get_constant(stmt->num_cpu_threads), begin, end,
          tlctx->get_constant(step), tlctx->get_constant(stmt->block_dim),
@@ -81,24 +70,19 @@ class TaskCodeGenCPU : public TaskCodeGenLLVM {
 
   void create_offload_mesh_for(OffloadedStmt *stmt) override {
     auto *tls_prologue = create_mesh_xlogue(stmt->tls_prologue);
-
     llvm::Function *body;
     {
-      // FIX: Use `llvm::PointerType::get(context, address_space)`
       auto guard = get_function_creation_guard(
           {llvm::PointerType::get(get_runtime_type("RuntimeContext"), 0),
            llvm::PointerType::get(*llvm_context, 0),
            tlctx->get_data_type<int>()});
-
       for (int i = 0; i < stmt->mesh_prologue->size(); i++) {
         auto &s = stmt->mesh_prologue->statements[i];
         s->accept(this);
       }
-
       if (stmt->bls_prologue) {
         stmt->bls_prologue->accept(this);
       }
-
       auto loop_test_bb =
           llvm::BasicBlock::Create(*llvm_context, "loop_test", func);
       auto loop_body_bb =
@@ -109,7 +93,6 @@ class TaskCodeGenCPU : public TaskCodeGenLLVM {
           create_entry_block_alloca(llvm::Type::getInt32Ty(*llvm_context));
       builder->CreateStore(tlctx->get_constant(0), loop_index);
       builder->CreateBr(loop_test_bb);
-
       {
         builder->SetInsertPoint(loop_test_bb);
         auto *loop_index_load =
@@ -120,7 +103,6 @@ class TaskCodeGenCPU : public TaskCodeGenLLVM {
                          ->second]);
         builder->CreateCondBr(cond, loop_body_bb, func_exit);
       }
-
       {
         builder->SetInsertPoint(loop_body_bb);
         loop_vars_llvm[stmt].push_back(loop_index);
@@ -136,16 +118,12 @@ class TaskCodeGenCPU : public TaskCodeGenLLVM {
         builder->CreateBr(loop_test_bb);
         builder->SetInsertPoint(func_exit);
       }
-
       if (stmt->bls_epilogue) {
         stmt->bls_epilogue->accept(this);
       }
-
       body = guard.body;
     }
-
     llvm::Value *epilogue = create_mesh_xlogue(stmt->tls_epilogue);
-
     call("cpu_parallel_mesh_for", get_arg(0),
          tlctx->get_constant(stmt->num_cpu_threads),
          tlctx->get_constant(stmt->mesh->num_patches),
@@ -264,11 +242,11 @@ void KernelCodeGenCPU::optimize_module(llvm::Module *module) {
   options.GuaranteedTailCallOpt = false;
 
   llvm::StringRef mcpu = llvm::sys::getHostCPUName();
-  // FIX: `CodeGenOpt::Aggressive` is removed. Optimizations are handled by PassBuilder.
+  // FIX: Use llvm::CodeGenOptLevel
   std::unique_ptr<llvm::TargetMachine> target_machine(
       target->createTargetMachine(triple.str(), mcpu.str(), "", options,
                                   llvm::Reloc::PIC_, llvm::CodeModel::Small,
-                                  llvm::CodeGenOpt::None));
+                                  compile_config.opt_level > 0 ? llvm::CodeGenOptLevel::Default : llvm::CodeGenOptLevel::None));
 
   TI_ERROR_UNLESS(target_machine, "Could not allocate target machine!");
   module->setDataLayout(target_machine->createDataLayout());
@@ -287,30 +265,25 @@ void KernelCodeGenCPU::optimize_module(llvm::Module *module) {
   PB.registerLoopAnalyses(LAM);
   PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
 
-  // FIX: `PassBuilder::OptimizationLevel` is now `llvm::OptimizationLevel`
   llvm::OptimizationLevel opt_level = llvm::OptimizationLevel::O3;
 
-  llvm::ModulePassManager MPM = PB.buildPerModuleDefaultPipeline(opt_level);
+  llvm::ModulePassManager MPM;
+  if (compile_config.opt_level > 0) {
+    MPM = PB.buildPerModuleDefaultPipeline(opt_level);
+  }
 
-  // FIX: Correctly adapt Loop and Function passes to the Module pipeline.
   llvm::FunctionPassManager FPM;
-  // Loop passes must be adapted to run in a function pass pipeline.
   FPM.addPass(llvm::createFunctionToLoopPassAdaptor(llvm::LoopStrengthReducePass()));
   FPM.addPass(llvm::createFunctionToLoopPassAdaptor(llvm::IndVarSimplifyPass()));
-  // These are function passes, so they can be added directly.
   FPM.addPass(llvm::SeparateConstOffsetFromGEPPass(false));
   FPM.addPass(llvm::EarlyCSEPass(true));
-  // Adapt the entire function pass manager to run in the module pipeline.
   MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
 
-
   llvm::SmallString<0> asm_buffer;
   if (compile_config.print_kernel_asm) {
-    // FIX: `raw_svector_ostream` must be constructed directly on the buffer
     llvm::raw_svector_ostream asm_stream(asm_buffer);
-    asm_stream.SetUnbuffered();
-    // FIX: `CGFT_AssemblyFile` is now `CodeGenFileType::AssemblyFile`
-    if (auto err = target_machine->addPassesToEmitFile(
+    // FIX: Pass the stream by reference, not by pointer
+    if (target_machine->addPassesToEmitFile(
             MPM, asm_stream, nullptr, llvm::CodeGenFileType::AssemblyFile)) {
       TI_ERROR("Failed to addPassesToEmitFile");
     }
@@ -325,7 +298,6 @@ void KernelCodeGenCPU::optimize_module(llvm::Module *module) {
     static FileSequenceWriter writer(
         "taichi_kernel_cpu_llvm_ir_optimized_asm_{:04d}.s",
         "optimized assembly code (CPU)");
-    // The buffer is only filled after MPM.run(), so we get the content now.
     writer.write(std::string(asm_buffer.str()));
   }
 
diff --git a/taichi/codegen/llvm/codegen_llvm.cpp b/taichi/codegen/llvm/codegen_llvm.cpp
index 85e9b1788098e..c0f4d8af21fd0 100644
--- a/taichi/codegen/llvm/codegen_llvm.cpp
+++ b/taichi/codegen/llvm/codegen_llvm.cpp
@@ -1,3 +1,4 @@
+
 #include "taichi/codegen/llvm/codegen_llvm.h"
 
 #include <algorithm>
@@ -199,6 +200,9 @@ void TaskCodeGenLLVM::emit_extra_unary(UnaryOpStmt *stmt) {
   UNARY_STD(asin)
   UNARY_STD(cos)
   UNARY_STD(sin)
+  UNARY_STD(erf)
+  UNARY_STD(erfc)
+
   else if (op == UnaryOpType::sqrt) {
     llvm_val[stmt] =
         builder->CreateIntrinsic(llvm::Intrinsic::sqrt, {input_type}, {input});
@@ -1679,10 +1683,9 @@ llvm::Value *TaskCodeGenLLVM::call(
   auto prefix = get_runtime_snode_name(snode);
   auto s = emit_struct_meta(snode);
   auto s_ptr =
-      builder->CreateBitCast(s, llvm::Type::getInt8PtrTy(*llvm_context));
-
-  node_ptr =
-      builder->CreateBitCast(node_ptr, llvm::Type::getInt8PtrTy(*llvm_context));
+      builder->CreateBitCast(s, llvm::PointerType::get(*llvm_context, 0));
+  node_ptr = builder->CreateBitCast(node_ptr,
+                                    llvm::PointerType::get(*llvm_context, 0));
 
   std::vector<llvm::Value *> func_arguments{s_ptr, node_ptr};
 
@@ -1794,14 +1797,18 @@ void TaskCodeGenLLVM::visit(SNodeLookupStmt *stmt) {
   parent = llvm_val[stmt->input_snode];
   TI_ASSERT(parent);
   auto snode = stmt->snode;
+
   if (snode->type == SNodeType::root) {
     // FIXME: get parent_type from taichi instead of llvm.
     llvm::Type *parent_ty = builder->getInt8Ty();
+
     if (auto bit_cast = llvm::dyn_cast<llvm::BitCastInst>(parent)) {
       parent_ty = bit_cast->getDestTy();
-      if (auto ptr_ty = llvm::dyn_cast<llvm::PointerType>(parent_ty))
-        parent_ty = ptr_ty->getPointerElementType();
+      if (auto ptr_ty = llvm::dyn_cast<llvm::PointerType>(parent_ty)) {
+        TI_NOT_IMPLEMENTED;
+      }
     }
+
     llvm_val[stmt] =
         builder->CreateGEP(parent_ty, parent, llvm_val[stmt->input_index]);
   } else if (snode->type == SNodeType::dense ||
@@ -1842,7 +1849,7 @@ void TaskCodeGenLLVM::visit(GetChStmt *stmt) {
         stmt->output_snode->get_snode_tree_id(),
         stmt->output_snode->get_ch_from_parent_func_name(),
         builder->CreateBitCast(llvm_val[stmt->input_ptr],
-                               llvm::PointerType::getInt8PtrTy(*llvm_context)));
+                               llvm::PointerType::get(*llvm_context, 0)));
     llvm_val[stmt] = builder->CreateBitCast(
         ch, llvm::PointerType::get(StructCompilerLLVM::get_llvm_node_type(
                                        module.get(), stmt->output_snode),
@@ -2436,8 +2443,8 @@ void TaskCodeGenLLVM::visit(AdStackAllocaStmt *stmt) {
   auto type = llvm::ArrayType::get(llvm::Type::getInt8Ty(*llvm_context),
                                    stmt->size_in_bytes());
   auto alloca = create_entry_block_alloca(type, sizeof(int64));
-  llvm_val[stmt] = builder->CreateBitCast(
-      alloca, llvm::PointerType::getInt8PtrTy(*llvm_context));
+  llvm_val[stmt] =
+      builder->CreateBitCast(alloca, llvm::PointerType::get(*llvm_context, 0));
   call("stack_init", llvm_val[stmt]);
 }
 
@@ -2629,7 +2636,7 @@ llvm::Value *TaskCodeGenLLVM::get_tls_base_ptr() {
 }
 
 llvm::Type *TaskCodeGenLLVM::get_tls_buffer_type() {
-  return llvm::Type::getInt8PtrTy(*llvm_context);
+  return llvm::PointerType::get(*llvm_context, 0);
 }
 
 std::vector<llvm::Type *> TaskCodeGenLLVM::get_xlogue_argument_types() {
@@ -2653,6 +2660,7 @@ llvm::Type *TaskCodeGenLLVM::get_mesh_xlogue_function_type() {
 }
 
 llvm::PointerType *TaskCodeGenLLVM::get_integer_ptr_type(int bits) {
+#if 0
   switch (bits) {
     case 8:
       return llvm::Type::getInt8PtrTy(*llvm_context);
@@ -2665,6 +2673,18 @@ llvm::PointerType *TaskCodeGenLLVM::get_integer_ptr_type(int bits) {
     default:
       break;
   }
+#else
+  // opaque pointer
+  switch (bits) {
+    case 8:
+    case 16:
+    case 32:
+    case 64:
+      return llvm::PointerType::get(*llvm_context, 0);
+    default:
+      break;
+  }
+#endif
   TI_ERROR("No compatible " + std::to_string(bits) + " bits integer ptr type.");
   return nullptr;
 }
@@ -2989,4 +3009,4 @@ LLVMCompiledKernel LLVMCompiledKernel::clone() const {
 
 }  // namespace taichi::lang
 
-#endif  // #ifdef TI_WITH_LLVM
+#endif  // #ifdef TI_WITH_LLVM
\ No newline at end of file
diff --git a/taichi/codegen/llvm/codegen_llvm.h b/taichi/codegen/llvm/codegen_llvm.h
index 00d025b61d1e6..da6ba2b1279a2 100644
--- a/taichi/codegen/llvm/codegen_llvm.h
+++ b/taichi/codegen/llvm/codegen_llvm.h
@@ -1,3 +1,4 @@
+
 // The LLVM backend for CPUs/NVPTX/AMDGPU
 #pragma once
 
@@ -438,4 +439,4 @@ class TaskCodeGenLLVM : public IRVisitor, public LLVMModuleBuilder {
 
 }  // namespace taichi::lang
 
-#endif  // #ifdef TI_WITH_LLVM
+#endif  // #ifdef TI_WITH_LLVM
\ No newline at end of file
diff --git a/taichi/codegen/llvm/llvm_codegen_utils.cpp b/taichi/codegen/llvm/llvm_codegen_utils.cpp
index 99a06c403e61f..0f23f14ed0f9c 100644
--- a/taichi/codegen/llvm/llvm_codegen_utils.cpp
+++ b/taichi/codegen/llvm/llvm_codegen_utils.cpp
@@ -25,15 +25,23 @@ bool is_same_type(llvm::Type *a, llvm::Type *b) {
   if (a == b) {
     return true;
   }
+
   if (a->isPointerTy() != b->isPointerTy()) {
     return false;
   }
+
   if (a->isPointerTy()) {
+#if 0
     return is_same_type(a->getPointerElementType(), b->getPointerElementType());
+#else
+    return true;
+#endif
   }
+
   if (a->isFunctionTy() != b->isFunctionTy()) {
     return false;
   }
+
   if (a->isFunctionTy()) {
     auto req_func = llvm::cast<llvm::FunctionType>(a);
     auto prov_func = llvm::cast<llvm::FunctionType>(b);
@@ -144,4 +152,4 @@ void check_func_call_signature(llvm::FunctionType *func_type,
   }
 }
 
-}  // namespace taichi::lang
+}  // namespace taichi::lang
\ No newline at end of file
diff --git a/taichi/codegen/llvm/struct_llvm.cpp b/taichi/codegen/llvm/struct_llvm.cpp
index 849b1f0b3bb3d..5e10b6b3116de 100644
--- a/taichi/codegen/llvm/struct_llvm.cpp
+++ b/taichi/codegen/llvm/struct_llvm.cpp
@@ -8,7 +8,6 @@
 #include "taichi/util/file_sequence_writer.h"
 
 namespace taichi::lang {
-
 StructCompilerLLVM::StructCompilerLLVM(Arch arch,
                                        const CompileConfig &config,
                                        TaichiLLVMContext *tlctx,
@@ -104,14 +103,14 @@ void StructCompilerLLVM::generate_types(SNode &snode) {
     // mutex
     aux_type = llvm::ArrayType::get(llvm::PointerType::getInt64Ty(*ctx),
                                     snode.max_num_elements());
-    body_type = llvm::ArrayType::get(llvm::PointerType::getInt8PtrTy(*ctx),
+    body_type = llvm::ArrayType::get(llvm::PointerType::get(*ctx, 0),
                                      snode.max_num_elements());
   } else if (type == SNodeType::dynamic) {
     // mutex and n (number of elements)
     aux_type =
         llvm::StructType::get(*ctx, {llvm::PointerType::getInt32Ty(*ctx),
                                      llvm::PointerType::getInt32Ty(*ctx)});
-    body_type = llvm::PointerType::getInt8PtrTy(*ctx);
+    body_type = llvm::PointerType::get(*ctx, 0);
   } else {
     TI_P(snode.type_name());
     TI_NOT_IMPLEMENTED;
@@ -206,10 +205,9 @@ void StructCompilerLLVM::generate_child_accessors(SNode &snode) {
 
     auto inp_type =
         llvm::PointerType::get(get_llvm_element_type(module.get(), parent), 0);
-
     auto ft =
-        llvm::FunctionType::get(llvm::Type::getInt8PtrTy(*llvm_ctx_),
-                                {llvm::Type::getInt8PtrTy(*llvm_ctx_)}, false);
+        llvm::FunctionType::get(llvm::PointerType::get(*llvm_ctx_, 0),
+                                {llvm::PointerType::get(*llvm_ctx_, 0)}, false);
 
     auto func = create_function(ft, snode.get_ch_from_parent_func_name());
 
@@ -221,15 +219,14 @@ void StructCompilerLLVM::generate_child_accessors(SNode &snode) {
     for (auto &arg : func->args()) {
       args.push_back(&arg);
     }
+
     llvm::Value *ret;
     ret = builder.CreateGEP(get_llvm_element_type(module.get(), parent),
                             builder.CreateBitCast(args[0], inp_type),
                             {tlctx_->get_constant(0),
                              tlctx_->get_constant(parent->child_id(&snode))},
                             "getch");
-
-    builder.CreateRet(
-        builder.CreateBitCast(ret, llvm::Type::getInt8PtrTy(*llvm_ctx_)));
+    builder.CreateRet(ret);
   }
 
   for (auto &ch : snode.ch) {
@@ -311,5 +308,4 @@ llvm::Function *StructCompilerLLVM::create_function(llvm::FunctionType *ft,
   return llvm::Function::Create(ft, llvm::Function::ExternalLinkage, func_name,
                                 *module);
 }
-
-}  // namespace taichi::lang
+}  // namespace taichi::lang
\ No newline at end of file
diff --git a/taichi/common/core.h b/taichi/common/core.h
index f112f8f35baad..0f08bb594548a 100644
--- a/taichi/common/core.h
+++ b/taichi/common/core.h
@@ -167,14 +167,14 @@ using real = float32;
 
 // Float literal for both float32/64
 // (Learned from https://github.com/hi2p-perim/lightmetrica-v2)
-real constexpr operator"" _f(long double v) {
+real constexpr operator""_f(long double v) {
   return real(v);
 }
 real constexpr operator"" _f(unsigned long long v) {
   return real(v);
 }
 
-float32 constexpr operator"" _f32(long double v) {
+float32 constexpr operator""_f32(long double v) {
   return float32(v);
 }
 float32 constexpr operator"" _f32(unsigned long long v) {
diff --git a/taichi/common/types.h b/taichi/common/types.h
index b4728a6daa22d..89dbc7c2e2be9 100644
--- a/taichi/common/types.h
+++ b/taichi/common/types.h
@@ -38,14 +38,14 @@ using real = float32;
 
 // Float literal for both float32/64
 // (Learned from https://github.com/hi2p-perim/lightmetrica-v2)
-real constexpr operator"" _f(long double v) {
+real constexpr operator""_f(long double v) {
   return real(v);
 }
 real constexpr operator"" _f(unsigned long long v) {
   return real(v);
 }
 
-float32 constexpr operator"" _f32(long double v) {
+float32 constexpr operator""_f32(long double v) {
   return float32(v);
 }
 float32 constexpr operator"" _f32(unsigned long long v) {
diff --git a/taichi/math/linalg.h b/taichi/math/linalg.h
index 4d31bb5e7fb10..0f5ee6fd053c2 100644
--- a/taichi/math/linalg.h
+++ b/taichi/math/linalg.h
@@ -242,7 +242,10 @@ struct VectorND : public VectorNDBase<dim__, T, ISE> {
   }
 
   TI_FORCE_INLINE VectorND &operator=(const VectorND &o) {
-    memcpy(this, &o, sizeof(*this));
+    // memcpy(this, &o, sizeof(*this));
+    for (int i = 0; i < dim; ++i) {
+      entries[i] = o[i];
+    }
     return *this;
   }
 
diff --git a/taichi/runtime/cuda/jit_cuda.cpp b/taichi/runtime/cuda/jit_cuda.cpp
index 46860c562678c..eede36e02d99e 100644
--- a/taichi/runtime/cuda/jit_cuda.cpp
+++ b/taichi/runtime/cuda/jit_cuda.cpp
@@ -1,88 +1,153 @@
-
 #include "taichi/runtime/cuda/jit_cuda.h"
 #include "taichi/runtime/llvm/llvm_context.h"
+#include "taichi/util/file_sequence_writer.h"
+#include "taichi/runtime/program_impls/llvm/llvm_program.h"
 
+// === CHANGED SECTION: HEADER INCLUDES ===
+// Add headers for NPM and the specific passes we need.
 #include "llvm/Passes/PassBuilder.h"
+#include "llvm/Transforms/Scalar/LoopStrengthReduce.h"
+#include "llvm/Transforms/Scalar/IndVarSimplify.h"
+#include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h"
+#include "llvm/Transforms/Scalar/EarlyCSE.h"
+// === END OF CHANGED SECTION ===
 
 namespace taichi::lang {
 
 #if defined(TI_WITH_CUDA)
 
-JITModule *JITSessionCUDA ::add_module(std::unique_ptr<llvm::Module> M,
-                                       int max_reg) {
+// Helper function to check if runtime_initialize exists
+bool module_has_runtime_initialize(
+    const llvm::Module::FunctionListType &function_list) {
+  for (const auto &func : function_list) {
+    if (func.getName() == "runtime_initialize") {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Helper function to get a representative name for dumping files
+std::string moduleToDumpName(llvm::Module *M) {
+  std::string dumpName = M->getName().str();
+  if (M->getFunctionList().empty()) {
+    return dumpName;
+  }
+  if (!module_has_runtime_initialize(M->getFunctionList())) {
+    // If runtime_initialize is not present, it's likely a kernel-only module.
+    // Use the first function's name for a more descriptive dump file name.
+    dumpName = M->getFunctionList().begin()->getName().str();
+  }
+  return dumpName;
+}
+
+JITModule *JITSessionCUDA::add_module(std::unique_ptr<llvm::Module> M,
+                                      int max_reg) {
+  const char *dump_ir_env = std::getenv("TAICHI_DUMP_IR");
+  if (dump_ir_env) {
+    const std::string dumpOutDir = "/tmp/ir/";
+    std::filesystem::create_directories(dumpOutDir);
+    std::string dumpName = moduleToDumpName(M.get());
+    std::string filename = dumpOutDir + "/" + dumpName + "_before_ptx.ll";
+    std::error_code EC;
+    llvm::raw_fd_ostream dest_file(filename, EC);
+    if (!EC) {
+      M->print(dest_file, nullptr);
+    } else {
+      TI_WARN("Failed to dump LLVM IR to file {}: {}", filename, EC.message());
+    }
+  }
+
   auto ptx = compile_module_to_ptx(M);
-  if (this->config_.print_kernel_asm) {
+  if (this->config.print_kernel_asm) {
     static FileSequenceWriter writer("taichi_kernel_nvptx_{:04d}.ptx",
                                      "module NVPTX");
     writer.write(ptx);
   }
 
-  // TODO: figure out why using the guard leads to wrong tests results
-  // auto context_guard = CUDAContext::get_instance().get_guard();
-  CUDAContext::get_instance().make_current();
+  if (dump_ir_env) {
+    const std::string dumpOutDir = "/tmp/ptx/";
+    std::filesystem::create_directories(dumpOutDir);
+    std::string dumpName = moduleToDumpName(M.get());
+    std::string filename = dumpOutDir + "/" + dumpName + ".ptx";
+    std::ofstream out_file(filename);
+    if (out_file.is_open()) {
+      out_file << ptx;
+      out_file.close();
+      TI_INFO("PTX dumped to: {}", filename);
+    }
+  }
 
-  // Create module for object
+  const char *load_ptx_env = std::getenv("TAICHI_LOAD_PTX");
+  if (load_ptx_env) {
+    const std::string dumpOutDir = "/tmp/ptx/";
+    std::string dumpName = moduleToDumpName(M.get());
+    std::string filename = dumpOutDir + "/" + dumpName + ".ptx";
+    std::ifstream in_file(filename);
+    if (in_file.is_open()) {
+      TI_INFO("Loading PTX from file: {}", filename);
+      std::stringstream ptx_stream;
+      ptx_stream << in_file.rdbuf();
+      ptx = ptx_stream.str();
+      in_file.close();
+    } else {
+      TI_WARN("Failed to open PTX file for loading: {}", filename);
+    }
+  }
+  if (ptx.back() != '\0') {
+    ptx += '\0'; // Ensure null termination
+  }
+
+  CUDAContext::get_instance().make_current();
   void *cuda_module;
   TI_TRACE("PTX size: {:.2f}KB", ptx.size() / 1024.0);
   auto t = Time::get_time();
   TI_TRACE("Loading module...");
   [[maybe_unused]] auto _ = CUDAContext::get_instance().get_lock_guard();
 
-  constexpr int max_num_options = 8;
-  int num_options = 0;
-  uint32 options[max_num_options];
-  void *option_values[max_num_options];
+  std::vector<CUjit_option> options;
+  std::vector<void *> option_values;
+  unsigned int max_reg_uint = max_reg;
 
-  // Insert options
-  if (max_reg != 0) {
-    options[num_options] = CU_JIT_MAX_REGISTERS;
-    option_values[num_options] = &max_reg;
-    num_options++;
+  if (max_reg > 0) {
+    options.push_back(CU_JIT_MAX_REGISTERS);
+    option_values.push_back(&max_reg_uint);
   }
 
-  TI_ASSERT(num_options <= max_num_options);
-
-  CUDADriver::get_instance().module_load_data_ex(
-      &cuda_module, ptx.c_str(), num_options, options, option_values);
+  CUDADriver::get_instance().module_load_data_ex(&cuda_module, ptx.c_str(),
+                                                 options.size(),
+                                                 options.data(),
+                                                 option_values.data());
   TI_TRACE("CUDA module load time : {}ms", (Time::get_time() - t) * 1000);
 
   modules.push_back(std::make_unique<JITModuleCUDA>(cuda_module));
   return modules.back().get();
 }
 
+// Generates the PTX features string, e.g., "+ptx63"
 std::string cuda_mattrs() {
-  // TODO: upgrade to ptx78 as supported by LLVM 16
-  return "+ptx63";
+  return "+ptx" + std::to_string(CUDAContext::get_instance().get_ptx_version());
 }
 
-std::string convert(std::string new_name) {
-  // Evil C++ mangling on Windows will lead to "unsupported characters in
-  // symbol" error in LLVM PTX printer. Convert here.
-  for (int i = 0; i < (int)new_name.size(); i++) {
-    if (new_name[i] == '@') {
-      new_name.replace(i, 1, "_at_");
-    } else if (new_name[i] == '?') {
-      new_name.replace(i, 1, "_qm_");
-    } else if (new_name[i] == '$') {
-      new_name.replace(i, 1, "_dl_");
-    } else if (new_name[i] == '<') {
-      new_name.replace(i, 1, "_lb_");
-    } else if (new_name[i] == '>') {
-      new_name.replace(i, 1, "_rb_");
-    } else if (!std::isalpha(new_name[i]) && !std::isdigit(new_name[i]) &&
-               new_name[i] != '_' && new_name[i] != '.') {
-      new_name.replace(i, 1, "_xx_");
+// Mangles symbol names to be safe for PTX.
+std::string convert_name_for_ptx(std::string new_name) {
+  for (char &i : new_name) {
+    if (i == '@' || i == '?' || i == '$' || i == '<' || i == '>' ||
+        (!std::isalnum(i) && i != '_' && i != '.')) {
+      i = '_';
     }
   }
-  if (!new_name.empty())
-    TI_ASSERT(isalpha(new_name[0]) || new_name[0] == '_' || new_name[0] == '.');
+  if (!new_name.empty() && !isalpha(new_name[0]) && new_name[0] != '_' &&
+      new_name[0] != '.') {
+    new_name = "_" + new_name;
+  }
   return new_name;
 }
 
+// === CHANGED SECTION: FUNCTION REWRITTEN FOR NPM ===
 std::string JITSessionCUDA::compile_module_to_ptx(
     std::unique_ptr<llvm::Module> &module) {
   TI_AUTO_PROF
-  // Part of this function is borrowed from Halide::CodeGen_PTX_Dev.cpp
   if (llvm::verifyModule(*module, &llvm::errs())) {
     module->print(llvm::errs(), nullptr);
     TI_ERROR("LLVM Module broken");
@@ -90,107 +155,63 @@ std::string JITSessionCUDA::compile_module_to_ptx(
 
   using namespace llvm;
 
-  if (this->config_.print_kernel_llvm_ir) {
+  if (this->config.print_kernel_llvm_ir) {
     static FileSequenceWriter writer("taichi_kernel_cuda_llvm_ir_{:04d}.ll",
                                      "unoptimized LLVM IR (CUDA)");
     writer.write(module.get());
   }
 
-  for (auto &f : module->globals())
-    f.setName(convert(f.getName().str()));
-  for (auto &f : *module)
-    f.setName(convert(f.getName().str()));
+  for (auto &f : module->globals()) {
+    f.setName(convert_name_for_ptx(f.getName().str()));
+  }
+  for (auto &f : *module) {
+    f.setName(convert_name_for_ptx(f.getName().str()));
+  }
 
   llvm::Triple triple(module->getTargetTriple());
-
-  // Allocate target machine
   std::string err_str;
   const llvm::Target *target =
       TargetRegistry::lookupTarget(triple.str(), err_str);
   TI_ERROR_UNLESS(target, err_str);
 
   TargetOptions options;
-  if (this->config_.fast_math) {
+  if (this->config.fast_math) {
     options.AllowFPOpFusion = FPOpFusion::Fast;
-    // See NVPTXISelLowering.cpp
-    // Setting UnsafeFPMath true will result in approximations such as
-    // sqrt.approx in PTX for both f32 and f64
-    options.UnsafeFPMath = 1;
-    options.NoInfsFPMath = 1;
-    options.NoNaNsFPMath = 1;
-  } else {
-    options.AllowFPOpFusion = FPOpFusion::Strict;
-    options.UnsafeFPMath = 0;
-    options.NoInfsFPMath = 0;
-    options.NoNaNsFPMath = 0;
+    options.UnsafeFPMath = true;
+    options.NoInfsFPMath = true;
+    options.NoNaNsFPMath = true;
   }
-
-  options.HonorSignDependentRoundingFPMathOption = 0;
-  options.NoZerosInBSS = 0;
-  options.GuaranteedTailCallOpt = 0;
-
-#if LLVM_VERSION_MAJOR >= 18
-  const auto opt_level = llvm::CodeGenOptLevel::Aggressive;
-#else
-  const auto opt_level = llvm::CodeGenOpt::Aggressive;
-#endif
+  options.HonorSignDependentRoundingFPMathOption = false;
+  options.NoZerosInBSS = false;
+  options.GuaranteedTailCallOpt = false;
+  
+  // `CodeGenOpt::Aggressive` is removed. Optimizations are now controlled by PassBuilder.
   std::unique_ptr<TargetMachine> target_machine(target->createTargetMachine(
       triple.str(), CUDAContext::get_instance().get_mcpu(), cuda_mattrs(),
-      options, llvm::Reloc::PIC_, llvm::CodeModel::Small, opt_level));
-
-  TI_ERROR_UNLESS(target_machine.get(), "Could not allocate target machine!");
+      options, llvm::Reloc::PIC_, llvm::CodeModel::Small,
+      CodeGenOpt::None));
 
-  module->setTargetTriple(triple.str());
+  TI_ERROR_UNLESS(target_machine, "Could not allocate target machine!");
   module->setDataLayout(target_machine->createDataLayout());
 
-  // NVidia's libdevice library uses a __nvvm_reflect to choose
-  // how to handle denormalized numbers. (The pass replaces calls
-  // to __nvvm_reflect with a constant via a map lookup. The inliner
-  // pass then resolves these situations to fast code, often a single
-  // instruction per decision point.)
-  //
-  // The default is (more) IEEE like handling. FTZ mode flushes them
-  // to zero. (This may only apply to single-precision.)
-  //
-  // The libdevice documentation covers other options for math accuracy
-  // such as replacing division with multiply by the reciprocal and
-  // use of fused-multiply-add, but they do not seem to be controlled
-  // by this __nvvvm_reflect mechanism and may be flags to earlier compiler
-  // passes.
   const auto kFTZDenorms = 1;
-
-  // Insert a module flag for the FTZ handling.
   module->addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz",
                         kFTZDenorms);
-
   if (kFTZDenorms) {
     for (llvm::Function &fn : *module) {
-      /* nvptx-f32ftz was deprecated.
-       *
-       * https://github.com/llvm/llvm-project/commit/a4451d88ee456304c26d552749aea6a7f5154bde#diff-6fda74ef428299644e9f49a2b0994c0d850a760b89828f655030a114060d075a
-       */
       fn.addFnAttr("denormal-fp-math-f32", "preserve-sign");
-
-      // Use unsafe fp math for sqrt.approx instead of sqrt.rn
       fn.addFnAttr("unsafe-fp-math", "true");
     }
   }
 
-  // Create the new analysis manager
-  llvm::LoopAnalysisManager LAM;
-  llvm::FunctionAnalysisManager FAM;
-  llvm::CGSCCAnalysisManager CGAM;
-  llvm::ModuleAnalysisManager MAM;
-
-  // Create the new pass builder
-  llvm::PipelineTuningOptions PTO;
-  PTO.LoopInterleaving = false;
-  PTO.LoopVectorization = false;
-  PTO.SLPVectorization = true;
-  PTO.LoopUnrolling = false;
-  PTO.ForgetAllSCEVInLoopUnroll = true;
+  // === New Pass Manager Setup ===
+  LoopAnalysisManager LAM;
+  FunctionAnalysisManager FAM;
+  CGSCCAnalysisManager CGAM;
+  ModuleAnalysisManager MAM;
+  PassBuilder PB(target_machine.get());
 
-  llvm::PassBuilder PB(target_machine.get(), PTO);
+  FAM.registerPass([&] { return target_machine->getTargetIRAnalysis(); });
 
   PB.registerModuleAnalyses(MAM);
   PB.registerCGSCCAnalyses(CGAM);
@@ -198,54 +219,50 @@ std::string JITSessionCUDA::compile_module_to_ptx(
   PB.registerLoopAnalyses(LAM);
   PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
 
-  target_machine->registerPassBuilderCallbacks(PB);
-
-  llvm::ModulePassManager MPM =
-      PB.buildPerModuleDefaultPipeline(llvm::OptimizationLevel::O3);
-
+  // `PassManagerBuilder::OptLevel=3` is now `llvm::OptimizationLevel::O3`
+  OptimizationLevel opt_level = OptimizationLevel::O3;
+  
+  ModulePassManager MPM = PB.buildPerModuleDefaultPipeline(opt_level);
+
+  // Add the custom GEP optimization passes.
+  // These are FunctionPasses, so they need to be grouped in an FPM and then adapted.
+  FunctionPassManager FPM;
+  FPM.addPass(llvm::LoopStrengthReducePass());
+  FPM.addPass(llvm::IndVarSimplifyPass());
+  FPM.addPass(llvm::SeparateConstOffsetFromGEPPass(false));
+  FPM.addPass(llvm::EarlyCSEPass(true));
+  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+
+  // Setup the output stream for PTX
+  SmallString<0> outstr;
+  raw_svector_ostream ostream(outstr);
+  ostream.SetUnbuffered();
+  
+  target_machine->Options.MCOptions.AsmVerbose = true;
+  
+  // `CGFT_AssemblyFile` is now `CodeGenFileType::AssemblyFile`
+  if (target_machine->addPassesToEmitFile(MPM, ostream, nullptr,
+                                         CodeGenFileType::AssemblyFile, true)) {
+    TI_ERROR("Failed to set up passes to emit PTX source\n");
+  }
+  
+  // Run all passes
   {
     TI_PROFILER("llvm_module_pass");
     MPM.run(*module, MAM);
   }
 
-  if (llvm::verifyModule(*module, &llvm::errs())) {
-    module->print(llvm::errs(), nullptr);
-    TI_ERROR("LLVM Module broken");
-  }
-
-  if (this->config_.print_kernel_llvm_ir_optimized) {
+  if (this->config.print_kernel_llvm_ir_optimized) {
     static FileSequenceWriter writer(
         "taichi_kernel_cuda_llvm_ir_optimized_{:04d}.ll",
         "optimized LLVM IR (CUDA)");
     writer.write(module.get());
   }
 
-  llvm::SmallString<8> outstr;
-  raw_svector_ostream ostream(outstr);
-  ostream.SetUnbuffered();
-
-  llvm::legacy::PassManager LPM;
-  LPM.add(createTargetTransformInfoWrapperPass(
-      target_machine->getTargetIRAnalysis()));
-
-  // Override default to generate verbose assembly.
-  target_machine->Options.MCOptions.AsmVerbose = true;
-
-#if LLVM_VERSION_MAJOR >= 18
-  const auto file_type = llvm::CodeGenFileType::AssemblyFile;
-#else
-  const auto file_type = llvm::CGFT_AssemblyFile;
-#endif
-  bool fail = target_machine->addPassesToEmitFile(LPM, ostream, nullptr,
-                                                  file_type, true);
-
-  TI_ERROR_IF(fail, "Failed to set up passes to emit PTX source\n");
-  LPM.run(*module);
-
-  std::string buffer(outstr.begin(), outstr.end());
-  buffer.push_back(0);
-  return buffer;
+  return std::string(outstr.str());
 }
+// === END OF CHANGED SECTION ===
+
 
 std::unique_ptr<JITSession> create_llvm_jit_session_cuda(
     TaichiLLVMContext *tlctx,
@@ -261,8 +278,9 @@ std::unique_ptr<JITSession> create_llvm_jit_session_cuda(
     TaichiLLVMContext *tlctx,
     const CompileConfig &config,
     Arch arch) {
-  TI_NOT_IMPLEMENTED
+  TI_NOT_IMPLEMENTED;
+  return nullptr;
 }
 #endif
 
-}  // namespace taichi::lang
+}  // namespace taichi::lang
\ No newline at end of file
diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp
index a539d5697781f..2687b223275e7 100644
--- a/taichi/runtime/llvm/llvm_context.cpp
+++ b/taichi/runtime/llvm/llvm_context.cpp
@@ -39,6 +39,8 @@
 #include "llvm/Linker/Linker.h"
 #include "llvm/Demangle/Demangle.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/Transforms/IPO/StripDeadPrototypes.h"
+#include "llvm/Passes/PassBuilder.h"
 
 #include "taichi/util/lang_util.h"
 #include "taichi/jit/jit_session.h"
@@ -340,17 +342,20 @@ static void remove_useless_cuda_libdevice_functions(llvm::Module *module) {
 
 // Note: runtime_module = init_module < struct_module
 
-std::unique_ptr<llvm::Module> TaichiLLVMContext::clone_runtime_module() {
-  TI_AUTO_PROF
-  auto *mod = get_this_thread_runtime_module();
+std::unique_ptr<llvm::Module> TaichiLLVMContext::clone_runtime_module(
+    const std::string &name) {
+  TI_AUTO_PROF;
+  auto cloned = llvm::CloneModule(*runtime_module_.get());
+  cloned->setModuleIdentifier(name);
 
-  std::unique_ptr<llvm::Module> cloned;
-  {
-    TI_PROFILER("clone module");
-    cloned = llvm::CloneModule(*mod);
-  }
+  llvm::PassBuilder PB;
+  llvm::ModuleAnalysisManager MAM;
+  PB.registerModuleAnalyses(MAM);
+
+  llvm::ModulePassManager MPM;
+  MPM.addPass(llvm::StripDeadPrototypesPass());
 
-  TI_ASSERT(cloned != nullptr);
+  MPM.run(*cloned, MAM);
 
   return cloned;
 }

From de14f98dd815b248f23324d570ddc1adcdec7307 Mon Sep 17 00:00:00 2001
From: Tiffany Mintz <Tiffany.Mintz@amd.com>
Date: Tue, 19 Aug 2025 15:13:48 -0500
Subject: [PATCH 19/42] implementing error function and cuda updates; merging
 5449f722e796fac22d3b7a041488d77ac5e49b25 to
 649c58d7a7991080e5dae510be72f7c6ad528001 from johnnynunez/taichi master
 branch; some of the changes from these were captured in the previous commit
 to rocm/taichi

---
 .github/workflows/scripts/ti_build/entry.py |  17 ++-
 python/taichi/lang/ops.py                   |  19 +++
 python/taichi/math/mathimpl.py              |   4 +
 taichi/inc/unary_op.inc.h                   |   2 +
 taichi/ir/expression_ops.h                  |   2 +
 taichi/ir/ir_builder.cpp                    |  28 ++++-
 taichi/ir/ir_builder.h                      |   8 +-
 taichi/runtime/llvm/llvm_context_pass.h     | 123 ++++++++------------
 taichi/ui/common/window_base.cpp            |   1 +
 taichi/ui/ggui/gui.cpp                      |  17 ++-
 10 files changed, 131 insertions(+), 90 deletions(-)

diff --git a/.github/workflows/scripts/ti_build/entry.py b/.github/workflows/scripts/ti_build/entry.py
index ebeb6a08d2017..e96f96509d8db 100644
--- a/.github/workflows/scripts/ti_build/entry.py
+++ b/.github/workflows/scripts/ti_build/entry.py
@@ -33,7 +33,7 @@ def build_wheel(python: Command, pip: Command) -> None:
     Build the Taichi wheel
     """
 
-    #git.fetch("origin", "master", "--tags", "--force")
+    git.fetch("origin", "master", "--tags", "--force")
     proj_tags = []
     extra = []
 
@@ -52,11 +52,20 @@ def build_wheel(python: Command, pip: Command) -> None:
     elif wheel_tag:
         proj_tags.extend(["egg_info", f"--tag-build={wheel_tag}"])
 
-    if platform.system() == "Linux":
+    u = platform.uname()
+    if (u.system, u.machine) == ("Linux", "x86_64"):
         if is_manylinux2014():
             extra.extend(["-p", "manylinux2014_x86_64"])
         else:
             extra.extend(["-p", "manylinux_2_27_x86_64"])
+    elif (u.system, u.machine) in (("Linux", "arm64"), ("Linux", "aarch64")):
+        if is_manylinux2014():
+            extra.extend(["-p", "manylinux2014_aarch64"])
+        else:
+            extra.extend(["-p", "manylinux_2_27_aarch64"])
+    else:
+        extra.extend(["-p", "manylinux2014_x86_64"])
+
 
     python("setup.py", "clean")
     python("misc/make_changelog.py", "--ver", "origin/master", "--repo_dir", "./", "--save")
@@ -82,8 +91,8 @@ def setup_basic_build_env():
         setup_clang()
 
     setup_llvm()
-    if u.system in ("Linux", "Windows"):
-        # We support & test Vulkan shader debug printf on Linux && Windows
+    if u.system == "Linux":
+        # We support & test Vulkan shader debug printf on Linux
         # This is done through the validation layer
         from .vulkan import setup_vulkan
 
diff --git a/python/taichi/lang/ops.py b/python/taichi/lang/ops.py
index e259886d92002..81e98a4fb4f90 100644
--- a/python/taichi/lang/ops.py
+++ b/python/taichi/lang/ops.py
@@ -4,6 +4,7 @@
 from typing import Union
 
 import numpy as np
+import math
 from taichi._lib import core as _ti_core
 from taichi.lang import expr, impl
 from taichi.lang.exception import TaichiSyntaxError
@@ -555,6 +556,24 @@ def logical_not(a):
     """
     return _unary_operation(_ti_core.expr_logic_not, np.logical_not, a)
 
+def erf(a):
+    """The error function.
+    Args:
+        a (Union[:class:`~taichi.lang.expr.Expr`, :class:`~taichi.lang.matrix.Matrix`]): A number or a matrix.
+    Returns:
+        The error function of `a`.
+    """
+    return _unary_operation(_ti_core.expr_erf, math.erf, a)
+
+
+def erfc(a):
+    """The complement error function.
+    Args:
+        a (Union[:class:`~taichi.lang.expr.Expr`, :class:`~taichi.lang.matrix.Matrix`]): A number or a matrix.
+    Returns:
+        The complement error function of `a`.
+    """
+    return _unary_operation(_ti_core.expr_erfc, math.erfc, a)
 
 def random(dtype=float) -> Union[float, int]:
     """Return a single random float/integer according to the specified data type.
diff --git a/python/taichi/math/mathimpl.py b/python/taichi/math/mathimpl.py
index 85fb4565a5629..2ed6ad214ee2b 100644
--- a/python/taichi/math/mathimpl.py
+++ b/python/taichi/math/mathimpl.py
@@ -25,6 +25,8 @@
     sqrt,
     tan,
     tanh,
+    erf,
+    erfc,
 )
 from taichi.types import matrix, template, vector
 from taichi.types.primitive_types import f64, u32, u64
@@ -874,6 +876,8 @@ def clz(x):
     "step",
     "tan",
     "tanh",
+    "erf",
+    "erfc",
     "uvec2",
     "uvec3",
     "uvec4",
diff --git a/taichi/inc/unary_op.inc.h b/taichi/inc/unary_op.inc.h
index b7fa8abc109b9..d3102c50faf2d 100644
--- a/taichi/inc/unary_op.inc.h
+++ b/taichi/inc/unary_op.inc.h
@@ -24,3 +24,5 @@ PER_UNARY_OP(rsqrt)
 PER_UNARY_OP(bit_not)
 PER_UNARY_OP(logic_not)
 PER_UNARY_OP(undefined)
+PER_UNARY_OP(erf)
+PER_UNARY_OP(erfc)
\ No newline at end of file
diff --git a/taichi/ir/expression_ops.h b/taichi/ir/expression_ops.h
index 120b423cdda26..17504685a8d07 100644
--- a/taichi/ir/expression_ops.h
+++ b/taichi/ir/expression_ops.h
@@ -95,6 +95,8 @@ DEFINE_EXPRESSION_FUNC_UNARY(log)
 DEFINE_EXPRESSION_FUNC_UNARY(popcnt)
 DEFINE_EXPRESSION_FUNC_UNARY(clz)
 DEFINE_EXPRESSION_FUNC_UNARY(logic_not)
+DEFINE_EXPRESSION_FUNC_UNARY(erf)
+DEFINE_EXPRESSION_FUNC_UNARY(erfc)
 DEFINE_EXPRESSION_OP_UNARY(~, bit_not)
 DEFINE_EXPRESSION_OP_UNARY(-, neg)
 
diff --git a/taichi/ir/ir_builder.cpp b/taichi/ir/ir_builder.cpp
index b28412cd441f4..a7ab59a9571be 100644
--- a/taichi/ir/ir_builder.cpp
+++ b/taichi/ir/ir_builder.cpp
@@ -1,3 +1,4 @@
+
 #include "taichi/ir/ir_builder.h"
 #include "taichi/ir/statements.h"
 #include "taichi/common/logging.h"
@@ -129,6 +130,12 @@ ContinueStmt *IRBuilder::create_continue() {
   return insert(Stmt::make_typed<ContinueStmt>());
 }
 
+void IRBuilder::create_assert(Stmt *cond, const std::string &msg) {
+  std::vector<Stmt *> empty_args;
+  auto assert_stmt = Stmt::make_typed<AssertStmt>(cond, msg, empty_args);
+  insert(std::move(assert_stmt));
+}
+
 FuncCallStmt *IRBuilder::create_func_call(Function *func,
                                           const std::vector<Stmt *> &args) {
   return insert(Stmt::make_typed<FuncCallStmt>(func, args));
@@ -138,6 +145,12 @@ LoopIndexStmt *IRBuilder::get_loop_index(Stmt *loop, int index) {
   return insert(Stmt::make_typed<LoopIndexStmt>(loop, index));
 }
 
+ConstStmt *IRBuilder::get_bool(bool value) {
+  return insert(Stmt::make_typed<ConstStmt>(TypedConstant(
+      TypeFactory::get_instance().get_primitive_type(PrimitiveTypeID::u1),
+      value)));
+}
+
 ConstStmt *IRBuilder::get_int32(int32 value) {
   return insert(Stmt::make_typed<ConstStmt>(TypedConstant(
       TypeFactory::get_instance().get_primitive_type(PrimitiveTypeID::i32),
@@ -181,9 +194,10 @@ RandStmt *IRBuilder::create_rand(DataType value_type) {
 ArgLoadStmt *IRBuilder::create_arg_load(const std::vector<int> &arg_id,
                                         DataType dt,
                                         bool is_ptr,
-                                        int arg_depth) {
-  return insert(Stmt::make_typed<ArgLoadStmt>(arg_id, dt, is_ptr,
-                                              /*create_load*/ true, arg_depth));
+                                        int arg_depth,
+                                        bool create_load) {
+  return insert(Stmt::make_typed<ArgLoadStmt>(arg_id, dt, is_ptr, create_load,
+                                              arg_depth));
 }
 
 ReturnStmt *IRBuilder::create_return(Stmt *value) {
@@ -282,6 +296,14 @@ UnaryOpStmt *IRBuilder::create_clz(Stmt *value) {
   return insert(Stmt::make_typed<UnaryOpStmt>(UnaryOpType::clz, value));
 }
 
+UnaryOpStmt *IRBuilder::create_erf(Stmt *value) {
+  return insert(Stmt::make_typed<UnaryOpStmt>(UnaryOpType::erf, value));
+}
+
+UnaryOpStmt *IRBuilder::create_erfc(Stmt *value) {
+  return insert(Stmt::make_typed<UnaryOpStmt>(UnaryOpType::erfc, value));
+}
+
 BinaryOpStmt *IRBuilder::create_add(Stmt *l, Stmt *r) {
   return insert(Stmt::make_typed<BinaryOpStmt>(BinaryOpType::add, l, r));
 }
diff --git a/taichi/ir/ir_builder.h b/taichi/ir/ir_builder.h
index c585ed7be425e..0bd88e064aec2 100644
--- a/taichi/ir/ir_builder.h
+++ b/taichi/ir/ir_builder.h
@@ -1,3 +1,4 @@
+
 #pragma once
 
 #include "taichi/ir/ir.h"
@@ -121,6 +122,7 @@ class IRBuilder {
   IfStmt *create_if(Stmt *cond);
   WhileControlStmt *create_break();
   ContinueStmt *create_continue();
+  void create_assert(Stmt *cond, const std::string &msg);
 
   // Function.
   FuncCallStmt *create_func_call(Function *func,
@@ -130,6 +132,7 @@ class IRBuilder {
   LoopIndexStmt *get_loop_index(Stmt *loop, int index = 0);
 
   // Constants. TODO: add more types
+  ConstStmt *get_bool(bool value);
   ConstStmt *get_int32(int32 value);
   ConstStmt *get_int64(int64 value);
   ConstStmt *get_uint32(uint32 value);
@@ -148,7 +151,8 @@ class IRBuilder {
   ArgLoadStmt *create_arg_load(const std::vector<int> &arg_id,
                                DataType dt,
                                bool is_ptr,
-                               int arg_depth);
+                               int arg_depth,
+                               bool create_load = true);
   // Load kernel arguments.
   ArgLoadStmt *create_ndarray_arg_load(const std::vector<int> &arg_id,
                                        DataType dt,
@@ -181,6 +185,8 @@ class IRBuilder {
   UnaryOpStmt *create_log(Stmt *value);
   UnaryOpStmt *create_popcnt(Stmt *value);
   UnaryOpStmt *create_clz(Stmt *value);
+  UnaryOpStmt *create_erf(Stmt *value);
+  UnaryOpStmt *create_erfc(Stmt *value);
 
   // Binary operations. Returns the result.
   BinaryOpStmt *create_add(Stmt *l, Stmt *r);
diff --git a/taichi/runtime/llvm/llvm_context_pass.h b/taichi/runtime/llvm/llvm_context_pass.h
index 28044d00c1a32..e68cb5321a6a3 100644
--- a/taichi/runtime/llvm/llvm_context_pass.h
+++ b/taichi/runtime/llvm/llvm_context_pass.h
@@ -1,24 +1,17 @@
 #pragma once
 
 #include "llvm/IR/LLVMContext.h"
-// #include "llvm/IR/LegacyPassManager.h" // Obsolete: Removed
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Function.h"
-// #include "llvm/Pass.h" // Obsolete: Base classes for LPM are replaced
+#include "llvm/Pass.h"
 #include "llvm/IR/Module.h"
-// #include "llvm/Transforms/IPO.h" // Obsolete: Part of LPM
-// #include "llvm/Transforms/IPO/PassManagerBuilder.h" // Obsolete: Removed
+#include "llvm/Transforms/IPO.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 
-// === CHANGED SECTION: HEADER INCLUDES ===
-// New includes for the New Pass Manager (NPM) base classes.
-#include "llvm/IR/PassManager.h"
-// === END OF CHANGED SECTION ===
-
-
 #if defined(TI_WITH_AMDGPU)
 #include "taichi/rhi/amdgpu/amdgpu_context.h"
 #endif
@@ -27,19 +20,15 @@ namespace taichi {
 namespace lang {
 using namespace llvm;
 
-// === CHANGED SECTION: PASS DEFINITION ===
-// The pass now inherits from `PassInfoMixin` and `ModulePass` is replaced by an
-// interface that works with the New Pass Manager. The core logic is wrapped
-// inside a `run` method.
-struct AddStructForFuncPass : public PassInfoMixin<AddStructForFuncPass> {
+struct AddStructForFuncPass : public ModulePass {
+  static inline char ID{0};
   std::string func_name_;
   int tls_size_;
-
-  AddStructForFuncPass(std::string func_name, int tls_size)
-      : func_name_(std::move(func_name)), tls_size_(tls_size) {
+  AddStructForFuncPass(std::string func_name, int tls_size) : ModulePass(ID) {
+    func_name_ = func_name;
+    tls_size_ = tls_size;
   }
-
-  PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM) {
+  bool runOnModule(llvm::Module &M) override {
     auto struct_for_func = M.getFunction("parallel_struct_for");
     auto &llvm_context = M.getContext();
     auto value_map = llvm::ValueToValueMapTy();
@@ -83,26 +72,19 @@ struct AddStructForFuncPass : public PassInfoMixin<AddStructForFuncPass> {
     gep->replaceAllUsesWith(new_gep);
     gep->eraseFromParent();
     alloca->eraseFromParent();
-
-    // In NPM, we must return which analyses are preserved.
-    // Since this pass modifies the IR, we return `None` to indicate that
-    // all analyses are invalidated.
-    return PreservedAnalyses::none();
+    return false;
   }
 };
-// === END OF CHANGED SECTION ===
-
 
 #if defined(TI_WITH_AMDGPU)
-
-// === CHANGED SECTION: PASS DEFINITION ===
-// `FunctionPass` is replaced by a modern NPM-compatible interface.
-struct AMDGPUConvertAllocaInstAddressSpacePass : public PassInfoMixin<AMDGPUConvertAllocaInstAddressSpacePass> {
-  PreservedAnalyses run(llvm::Function &f, llvm::FunctionAnalysisManager &AM) {
+struct AMDGPUConvertAllocaInstAddressSpacePass : public FunctionPass {
+  static inline char ID{0};
+  AMDGPUConvertAllocaInstAddressSpacePass() : FunctionPass(ID) {
+  }
+  bool runOnFunction(llvm::Function &f) override {
     f.addFnAttr("target-cpu",
                 "gfx" + AMDGPUContext::get_instance().get_mcpu().substr(3, 4));
     f.addFnAttr("target-features", "");
-    bool changed = false;
     for (auto &bb : f) {
       std::vector<AllocaInst *> alloca_inst_vec;
       for (Instruction &inst : bb) {
@@ -113,9 +95,6 @@ struct AMDGPUConvertAllocaInstAddressSpacePass : public PassInfoMixin<AMDGPUConv
         }
         alloca_inst_vec.push_back(now_alloca);
       }
-      if (!alloca_inst_vec.empty()) {
-          changed = true;
-      }
       for (auto &allocainst : alloca_inst_vec) {
         auto alloca_type = allocainst->getAllocatedType();
         llvm::IRBuilder<> builder(allocainst);
@@ -130,20 +109,20 @@ struct AMDGPUConvertAllocaInstAddressSpacePass : public PassInfoMixin<AMDGPUConv
         allocainst->eraseFromParent();
       }
     }
-    return changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+    return false;
   }
 };
-// === END OF CHANGED SECTION ===
 
-
-// === CHANGED SECTION: PASS DEFINITION ===
-struct AMDGPUAddStructForFuncPass : public PassInfoMixin<AMDGPUAddStructForFuncPass> {
+struct AMDGPUAddStructForFuncPass : public ModulePass {
+  static inline char ID{0};
   std::string func_name_;
   int tls_size_;
   AMDGPUAddStructForFuncPass(std::string func_name, int tls_size)
-      : func_name_(std::move(func_name)), tls_size_(tls_size) {
+      : ModulePass(ID) {
+    func_name_ = func_name;
+    tls_size_ = tls_size;
   }
-  PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM) {
+  bool runOnModule(llvm::Module &M) override {
     auto struct_for_func = M.getFunction("parallel_struct_for");
     auto &llvm_context = M.getContext();
     auto value_map = llvm::ValueToValueMapTy();
@@ -200,16 +179,15 @@ struct AMDGPUAddStructForFuncPass : public PassInfoMixin<AMDGPUAddStructForFuncP
     gep->eraseFromParent();
     cast->eraseFromParent();
     alloca->eraseFromParent();
-    return PreservedAnalyses::none();
+    return false;
   }
 };
-// === END OF CHANGED SECTION ===
-
 
-// === CHANGED SECTION: PASS DEFINITION ===
-struct AMDGPUConvertFunctionBodyAllocsAddressSpacePass : public PassInfoMixin<AMDGPUConvertFunctionBodyAllocsAddressSpacePass> {
-  PreservedAnalyses run(llvm::Function &f, llvm::FunctionAnalysisManager &AM) {
-    bool changed = false;
+struct AMDGPUConvertFunctionBodyAllocsAddressSpacePass : public FunctionPass {
+  static inline char ID{0};
+  AMDGPUConvertFunctionBodyAllocsAddressSpacePass() : FunctionPass(ID) {
+  }
+  bool runOnFunction(llvm::Function &f) override {
     for (auto &bb : f) {
       if (bb.getName() != "allocs")
         continue;
@@ -223,9 +201,6 @@ struct AMDGPUConvertFunctionBodyAllocsAddressSpacePass : public PassInfoMixin<AM
         }
         alloca_inst_vec.push_back(now_alloca);
       }
-      if (!alloca_inst_vec.empty()) {
-          changed = true;
-      }
       for (auto &allocainst : alloca_inst_vec) {
         auto alloca_type = allocainst->getAllocatedType();
         llvm::IRBuilder<> builder(allocainst);
@@ -237,16 +212,15 @@ struct AMDGPUConvertFunctionBodyAllocsAddressSpacePass : public PassInfoMixin<AM
         allocainst->eraseFromParent();
       }
     }
-    return changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+    return false;
   }
 };
-// === END OF CHANGED SECTION ===
-
 
-// === CHANGED SECTION: PASS DEFINITION ===
-struct AMDGPUConvertFuncParamAddressSpacePass : public PassInfoMixin<AMDGPUConvertFuncParamAddressSpacePass> {
-  PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM) {
-    bool changed = false;
+struct AMDGPUConvertFuncParamAddressSpacePass : public ModulePass {
+  static inline char ID{0};
+  AMDGPUConvertFuncParamAddressSpacePass() : ModulePass(ID) {
+  }
+  bool runOnModule(llvm::Module &M) override {
     for (auto &f : M) {
       bool is_kernel = false;
       const std::string func_name = f.getName().str();
@@ -260,30 +234,28 @@ struct AMDGPUConvertFuncParamAddressSpacePass : public PassInfoMixin<AMDGPUConve
         // default value is 1,1024.
         f.addFnAttr("amdgpu-flat-work-group-size", "1, 1024");
         is_kernel = true;
-        changed = true;
       }
-      if (!is_kernel && !f.isDeclaration()) {
+      if (!is_kernel && !f.isDeclaration())
         f.setLinkage(llvm::Function::PrivateLinkage);
-        changed = true;
-      }
     }
     std::vector<llvm::Function *> kernel_function;
     for (auto &f : M) {
       if (f.getCallingConv() == llvm::CallingConv::AMDGPU_KERNEL)
         kernel_function.push_back(&f);
     }
-    if (!kernel_function.empty()) {
-        changed = true;
-    }
     for (auto &f : kernel_function) {
       llvm::FunctionType *func_type = f->getFunctionType();
       std::vector<llvm::Type *> new_func_params;
       for (auto &arg : f->args()) {
-        if (arg.getType()->isPointerTy()) { // Modern way to check for pointer type
-          // The old getWithSamePointeeType is deprecated.
-          // The modern way is to get the pointee type and create a new pointer type.
-          llvm::Type* pointee_type = llvm::dyn_cast<llvm::PointerType>(arg.getType())->getNonOpaquePointerElementType();
-          auto new_type = llvm::PointerType::get(pointee_type, unsigned(1));
+        if (arg.getType()->getTypeID() == llvm::Type::PointerTyID) {
+          // This is a temporary LLVM interface to handle transition from typed
+          // pointer to opaque pointer In the future, if we only clang++ > 14,
+          // we can compeletely comply to opaque pointer and replace the
+          // following code with llvm::PointerType::get(M.getContext(),
+          // usigned(1))
+          auto new_type = llvm::PointerType::getWithSamePointeeType(
+              llvm::dyn_cast<llvm::PointerType>(arg.getType()), unsigned(1));
+
           new_func_params.push_back(new_type);
         } else {
           new_func_params.push_back(arg.getType());
@@ -306,10 +278,10 @@ struct AMDGPUConvertFuncParamAddressSpacePass : public PassInfoMixin<AMDGPUConve
       for (llvm::Function::arg_iterator I = f->arg_begin(), E = f->arg_end(),
                                         I2 = new_func->arg_begin();
            I != E; ++I, ++I2) {
-        if (I->getType()->isPointerTy()) { // Modern check
+        if (I->getType()->getTypeID() == llvm::Type::PointerTyID) {
           auto &front_bb = new_func->getBasicBlockList().front();
           llvm::Instruction *addrspacecast =
-              new AddrSpaceCastInst(&*I2, I->getType());
+              new AddrSpaceCastInst(I2, I->getType());
           front_bb.getInstList().insertAfter(front_bb.getFirstInsertionPt(),
                                              addrspacecast);
           I->replaceAllUsesWith(addrspacecast);
@@ -322,10 +294,9 @@ struct AMDGPUConvertFuncParamAddressSpacePass : public PassInfoMixin<AMDGPUConve
 
       f->eraseFromParent();
     }
-    return changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+    return false;
   }
 };
-// === END OF CHANGED SECTION ===
 
 #endif
 
diff --git a/taichi/ui/common/window_base.cpp b/taichi/ui/common/window_base.cpp
index 10af909476ee6..16bb9a3fc25e8 100644
--- a/taichi/ui/common/window_base.cpp
+++ b/taichi/ui/common/window_base.cpp
@@ -1,5 +1,6 @@
 #include "taichi/ui/common/window_base.h"
 #include "taichi/rhi/common/window_system.h"
+#include "taichi/ui/utils/utils.h"
 
 namespace taichi::ui {
 
diff --git a/taichi/ui/ggui/gui.cpp b/taichi/ui/ggui/gui.cpp
index f5bc10833880b..994ba9e2613bc 100644
--- a/taichi/ui/ggui/gui.cpp
+++ b/taichi/ui/ggui/gui.cpp
@@ -44,7 +44,9 @@ Gui::Gui(AppContext *app_context, SwapChain *swap_chain, TaichiWindow *window) {
 
 void Gui::init_render_resources(VkRenderPass render_pass) {
   ImGui_ImplVulkan_LoadFunctions(
-      load_vk_function_for_gui);  // this is because we're using volk.
+      VK_API_VERSION_1_0,        // or app_context_->config.vk_api_version
+      load_vk_function_for_gui,  // this is because we're using volk.
+      nullptr);
 
   auto &device =
       static_cast<taichi::lang::vulkan::VulkanDevice &>(app_context_->device());
@@ -60,7 +62,9 @@ void Gui::init_render_resources(VkRenderPass render_pass) {
   init_info.Allocator = VK_NULL_HANDLE;
   init_info.MinImageCount = swap_chain_->surface().get_image_count();
   init_info.ImageCount = swap_chain_->surface().get_image_count();
-  ImGui_ImplVulkan_Init(&init_info, render_pass);
+  // new signature takes only the struct
+  init_info.RenderPass = render_pass;
+  ImGui_ImplVulkan_Init(&init_info);
   render_pass_ = render_pass;
 
   // Upload Fonts
@@ -68,15 +72,16 @@ void Gui::init_render_resources(VkRenderPass render_pass) {
     auto stream = device.get_graphics_stream();
     auto [cmd_list, res] = stream->new_command_list_unique();
     assert(res == RhiResult::success && "Failed to allocate command list");
-    VkCommandBuffer command_buffer =
+    [[maybe_unused]] VkCommandBuffer command_buffer =
         static_cast<VulkanCommandList *>(cmd_list.get())
             ->vk_command_buffer()
             ->buffer;
 
-    ImGui_ImplVulkan_CreateFontsTexture(command_buffer);
+    // ≥ 1.90: the helper records its own commands
+    ImGui_ImplVulkan_CreateFontsTexture();
 
     stream->submit_synced(cmd_list.get());
-    ImGui_ImplVulkan_DestroyFontUploadObjects();
+    ImGui_ImplVulkan_DestroyFontsTexture();
   }
 
   prepare_for_next_frame();
@@ -255,4 +260,4 @@ bool Gui::is_empty() {
 
 }  // namespace vulkan
 
-}  // namespace taichi::ui
+}  // namespace taichi::ui
\ No newline at end of file

From c984b3c98ec12f7e1517afc0cba4336fd71028ac Mon Sep 17 00:00:00 2001
From: Tiffany Mintz <Tiffany.Mintz@amd.com>
Date: Wed, 20 Aug 2025 10:08:17 -0500
Subject: [PATCH 20/42] removing updates for blackwell

---
 .github/workflows/scripts/ti_build/entry.py | 11 +----------
 python/taichi/lang/ops.py                   |  2 ++
 taichi/ui/common/window_base.cpp            |  1 -
 taichi/ui/ggui/gui.cpp                      |  5 +++--
 4 files changed, 6 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/scripts/ti_build/entry.py b/.github/workflows/scripts/ti_build/entry.py
index e96f96509d8db..570dc995dee13 100644
--- a/.github/workflows/scripts/ti_build/entry.py
+++ b/.github/workflows/scripts/ti_build/entry.py
@@ -52,20 +52,11 @@ def build_wheel(python: Command, pip: Command) -> None:
     elif wheel_tag:
         proj_tags.extend(["egg_info", f"--tag-build={wheel_tag}"])
 
-    u = platform.uname()
-    if (u.system, u.machine) == ("Linux", "x86_64"):
+    if platform.system() == "Linux":
         if is_manylinux2014():
             extra.extend(["-p", "manylinux2014_x86_64"])
         else:
             extra.extend(["-p", "manylinux_2_27_x86_64"])
-    elif (u.system, u.machine) in (("Linux", "arm64"), ("Linux", "aarch64")):
-        if is_manylinux2014():
-            extra.extend(["-p", "manylinux2014_aarch64"])
-        else:
-            extra.extend(["-p", "manylinux_2_27_aarch64"])
-    else:
-        extra.extend(["-p", "manylinux2014_x86_64"])
-
 
     python("setup.py", "clean")
     python("misc/make_changelog.py", "--ver", "origin/master", "--repo_dir", "./", "--save")
diff --git a/python/taichi/lang/ops.py b/python/taichi/lang/ops.py
index 81e98a4fb4f90..0df416f239870 100644
--- a/python/taichi/lang/ops.py
+++ b/python/taichi/lang/ops.py
@@ -1502,4 +1502,6 @@ def min(*args):  # pylint: disable=W0622
     "select",
     "abs",
     "pow",
+    "erf",
+    "erfc",
 ]
diff --git a/taichi/ui/common/window_base.cpp b/taichi/ui/common/window_base.cpp
index 16bb9a3fc25e8..10af909476ee6 100644
--- a/taichi/ui/common/window_base.cpp
+++ b/taichi/ui/common/window_base.cpp
@@ -1,6 +1,5 @@
 #include "taichi/ui/common/window_base.h"
 #include "taichi/rhi/common/window_system.h"
-#include "taichi/ui/utils/utils.h"
 
 namespace taichi::ui {
 
diff --git a/taichi/ui/ggui/gui.cpp b/taichi/ui/ggui/gui.cpp
index 994ba9e2613bc..13108ceb65f02 100644
--- a/taichi/ui/ggui/gui.cpp
+++ b/taichi/ui/ggui/gui.cpp
@@ -1,3 +1,4 @@
+
 #include "gui.h"
 #include "taichi/ui/ggui/swap_chain.h"
 #include "taichi/ui/ggui/app_context.h"
@@ -72,7 +73,7 @@ void Gui::init_render_resources(VkRenderPass render_pass) {
     auto stream = device.get_graphics_stream();
     auto [cmd_list, res] = stream->new_command_list_unique();
     assert(res == RhiResult::success && "Failed to allocate command list");
-    [[maybe_unused]] VkCommandBuffer command_buffer =
+    VkCommandBuffer command_buffer =
         static_cast<VulkanCommandList *>(cmd_list.get())
             ->vk_command_buffer()
             ->buffer;
@@ -260,4 +261,4 @@ bool Gui::is_empty() {
 
 }  // namespace vulkan
 
-}  // namespace taichi::ui
\ No newline at end of file
+}  // namespace taichi::ui

From f5118a709460070c89b41692d2ce2532f8b6fb04 Mon Sep 17 00:00:00 2001
From: Tiffany Mintz <Tiffany.Mintz@amd.com>
Date: Wed, 20 Aug 2025 10:21:48 -0500
Subject: [PATCH 21/42] removing blackwell updates; restoring window_base.cpp
 include

---
 taichi/runtime/cuda/jit_cuda.cpp | 75 ++++++++++++--------------------
 taichi/ui/common/window_base.cpp |  1 +
 2 files changed, 30 insertions(+), 46 deletions(-)

diff --git a/taichi/runtime/cuda/jit_cuda.cpp b/taichi/runtime/cuda/jit_cuda.cpp
index eede36e02d99e..958d98bdbf58b 100644
--- a/taichi/runtime/cuda/jit_cuda.cpp
+++ b/taichi/runtime/cuda/jit_cuda.cpp
@@ -3,14 +3,17 @@
 #include "taichi/util/file_sequence_writer.h"
 #include "taichi/runtime/program_impls/llvm/llvm_program.h"
 
-// === CHANGED SECTION: HEADER INCLUDES ===
-// Add headers for NPM and the specific passes we need.
 #include "llvm/Passes/PassBuilder.h"
 #include "llvm/Transforms/Scalar/LoopStrengthReduce.h"
 #include "llvm/Transforms/Scalar/IndVarSimplify.h"
 #include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h"
 #include "llvm/Transforms/Scalar/EarlyCSE.h"
-// === END OF CHANGED SECTION ===
+#include "llvm/Target/TargetMachine.h"
+
+// This is the crucial include for CUDA driver types like CUjit_option
+#include <cuda.h>
+#include <string>
+#include <vector>
 
 namespace taichi::lang {
 
@@ -34,8 +37,6 @@ std::string moduleToDumpName(llvm::Module *M) {
     return dumpName;
   }
   if (!module_has_runtime_initialize(M->getFunctionList())) {
-    // If runtime_initialize is not present, it's likely a kernel-only module.
-    // Use the first function's name for a more descriptive dump file name.
     dumpName = M->getFunctionList().begin()->getName().str();
   }
   return dumpName;
@@ -59,7 +60,7 @@ JITModule *JITSessionCUDA::add_module(std::unique_ptr<llvm::Module> M,
   }
 
   auto ptx = compile_module_to_ptx(M);
-  if (this->config.print_kernel_asm) {
+  if (this->config_.print_kernel_asm) {
     static FileSequenceWriter writer("taichi_kernel_nvptx_{:04d}.ptx",
                                      "module NVPTX");
     writer.write(ptx);
@@ -94,8 +95,8 @@ JITModule *JITSessionCUDA::add_module(std::unique_ptr<llvm::Module> M,
       TI_WARN("Failed to open PTX file for loading: {}", filename);
     }
   }
-  if (ptx.back() != '\0') {
-    ptx += '\0'; // Ensure null termination
+  if (ptx.empty() || ptx.back() != '\0') {
+    ptx += '\0';
   }
 
   CUDAContext::get_instance().make_current();
@@ -111,7 +112,7 @@ JITModule *JITSessionCUDA::add_module(std::unique_ptr<llvm::Module> M,
 
   if (max_reg > 0) {
     options.push_back(CU_JIT_MAX_REGISTERS);
-    option_values.push_back(&max_reg_uint);
+    option_values.push_back(reinterpret_cast<void *>(&max_reg_uint));
   }
 
   CUDADriver::get_instance().module_load_data_ex(&cuda_module, ptx.c_str(),
@@ -124,12 +125,11 @@ JITModule *JITSessionCUDA::add_module(std::unique_ptr<llvm::Module> M,
   return modules.back().get();
 }
 
-// Generates the PTX features string, e.g., "+ptx63"
 std::string cuda_mattrs() {
-  return "+ptx" + std::to_string(CUDAContext::get_instance().get_ptx_version());
+  return "+ptx" +
+         std::to_string(CUDAContext::get_instance().get_mcpu_version());
 }
 
-// Mangles symbol names to be safe for PTX.
 std::string convert_name_for_ptx(std::string new_name) {
   for (char &i : new_name) {
     if (i == '@' || i == '?' || i == '$' || i == '<' || i == '>' ||
@@ -144,7 +144,6 @@ std::string convert_name_for_ptx(std::string new_name) {
   return new_name;
 }
 
-// === CHANGED SECTION: FUNCTION REWRITTEN FOR NPM ===
 std::string JITSessionCUDA::compile_module_to_ptx(
     std::unique_ptr<llvm::Module> &module) {
   TI_AUTO_PROF
@@ -155,7 +154,7 @@ std::string JITSessionCUDA::compile_module_to_ptx(
 
   using namespace llvm;
 
-  if (this->config.print_kernel_llvm_ir) {
+  if (this->config_.print_kernel_llvm_ir) {
     static FileSequenceWriter writer("taichi_kernel_cuda_llvm_ir_{:04d}.ll",
                                      "unoptimized LLVM IR (CUDA)");
     writer.write(module.get());
@@ -175,7 +174,7 @@ std::string JITSessionCUDA::compile_module_to_ptx(
   TI_ERROR_UNLESS(target, err_str);
 
   TargetOptions options;
-  if (this->config.fast_math) {
+  if (this->config_.fast_math) {
     options.AllowFPOpFusion = FPOpFusion::Fast;
     options.UnsafeFPMath = true;
     options.NoInfsFPMath = true;
@@ -184,12 +183,12 @@ std::string JITSessionCUDA::compile_module_to_ptx(
   options.HonorSignDependentRoundingFPMathOption = false;
   options.NoZerosInBSS = false;
   options.GuaranteedTailCallOpt = false;
-  
-  // `CodeGenOpt::Aggressive` is removed. Optimizations are now controlled by PassBuilder.
+
   std::unique_ptr<TargetMachine> target_machine(target->createTargetMachine(
       triple.str(), CUDAContext::get_instance().get_mcpu(), cuda_mattrs(),
       options, llvm::Reloc::PIC_, llvm::CodeModel::Small,
-      CodeGenOpt::None));
+      config_.opt_level > 0 ? llvm::CodeGenOptLevel::Default
+                           : llvm::CodeGenOptLevel::None));
 
   TI_ERROR_UNLESS(target_machine, "Could not allocate target machine!");
   module->setDataLayout(target_machine->createDataLayout());
@@ -204,7 +203,6 @@ std::string JITSessionCUDA::compile_module_to_ptx(
     }
   }
 
-  // === New Pass Manager Setup ===
   LoopAnalysisManager LAM;
   FunctionAnalysisManager FAM;
   CGSCCAnalysisManager CGAM;
@@ -219,40 +217,36 @@ std::string JITSessionCUDA::compile_module_to_ptx(
   PB.registerLoopAnalyses(LAM);
   PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
 
-  // `PassManagerBuilder::OptLevel=3` is now `llvm::OptimizationLevel::O3`
   OptimizationLevel opt_level = OptimizationLevel::O3;
-  
-  ModulePassManager MPM = PB.buildPerModuleDefaultPipeline(opt_level);
+  ModulePassManager MPM;
+  if (config_.opt_level > 0) {
+    MPM = PB.buildPerModuleDefaultPipeline(opt_level);
+  }
 
-  // Add the custom GEP optimization passes.
-  // These are FunctionPasses, so they need to be grouped in an FPM and then adapted.
   FunctionPassManager FPM;
-  FPM.addPass(llvm::LoopStrengthReducePass());
-  FPM.addPass(llvm::IndVarSimplifyPass());
-  FPM.addPass(llvm::SeparateConstOffsetFromGEPPass(false));
-  FPM.addPass(llvm::EarlyCSEPass(true));
+  FPM.addPass(createFunctionToLoopPassAdaptor(LoopStrengthReducePass()));
+  FPM.addPass(createFunctionToLoopPassAdaptor(IndVarSimplifyPass()));
+  FPM.addPass(SeparateConstOffsetFromGEPPass(false));
+  FPM.addPass(EarlyCSEPass(true));
   MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
 
-  // Setup the output stream for PTX
   SmallString<0> outstr;
   raw_svector_ostream ostream(outstr);
   ostream.SetUnbuffered();
-  
+
   target_machine->Options.MCOptions.AsmVerbose = true;
-  
-  // `CGFT_AssemblyFile` is now `CodeGenFileType::AssemblyFile`
+
   if (target_machine->addPassesToEmitFile(MPM, ostream, nullptr,
                                          CodeGenFileType::AssemblyFile, true)) {
     TI_ERROR("Failed to set up passes to emit PTX source\n");
   }
-  
-  // Run all passes
+
   {
     TI_PROFILER("llvm_module_pass");
     MPM.run(*module, MAM);
   }
 
-  if (this->config.print_kernel_llvm_ir_optimized) {
+  if (this->config_.print_kernel_llvm_ir_optimized) {
     static FileSequenceWriter writer(
         "taichi_kernel_cuda_llvm_ir_optimized_{:04d}.ll",
         "optimized LLVM IR (CUDA)");
@@ -261,26 +255,15 @@ std::string JITSessionCUDA::compile_module_to_ptx(
 
   return std::string(outstr.str());
 }
-// === END OF CHANGED SECTION ===
-
 
 std::unique_ptr<JITSession> create_llvm_jit_session_cuda(
     TaichiLLVMContext *tlctx,
     const CompileConfig &config,
     Arch arch) {
   TI_ASSERT(arch == Arch::cuda);
-  // https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#data-layout
   auto data_layout = TaichiLLVMContext::get_data_layout(arch);
   return std::make_unique<JITSessionCUDA>(tlctx, config, data_layout);
 }
-#else
-std::unique_ptr<JITSession> create_llvm_jit_session_cuda(
-    TaichiLLVMContext *tlctx,
-    const CompileConfig &config,
-    Arch arch) {
-  TI_NOT_IMPLEMENTED;
-  return nullptr;
-}
 #endif
 
 }  // namespace taichi::lang
\ No newline at end of file
diff --git a/taichi/ui/common/window_base.cpp b/taichi/ui/common/window_base.cpp
index 10af909476ee6..16bb9a3fc25e8 100644
--- a/taichi/ui/common/window_base.cpp
+++ b/taichi/ui/common/window_base.cpp
@@ -1,5 +1,6 @@
 #include "taichi/ui/common/window_base.h"
 #include "taichi/rhi/common/window_system.h"
+#include "taichi/ui/utils/utils.h"
 
 namespace taichi::ui {
 

From 78d92136cbb7ea45c05fe7562b10f494c659bbe3 Mon Sep 17 00:00:00 2001
From: Tiffany Mintz <Tiffany.Mintz@amd.com>
Date: Wed, 20 Aug 2025 15:59:14 -0500
Subject: [PATCH 22/42] additional cuda updates for llvm20; merging from
 8ca16de9a24e82baaed1deac80f71bd541c131ca to
 add2df35782768f5ad243cacef0922ec00890ba5 from johnnynunez/taichi master
 branch

---
 taichi/codegen/cuda/codegen_cuda.cpp | 67 ++++++++++++++++------------
 1 file changed, 38 insertions(+), 29 deletions(-)

diff --git a/taichi/codegen/cuda/codegen_cuda.cpp b/taichi/codegen/cuda/codegen_cuda.cpp
index a88be5ef9b337..bdfc70e9ef342 100644
--- a/taichi/codegen/cuda/codegen_cuda.cpp
+++ b/taichi/codegen/cuda/codegen_cuda.cpp
@@ -17,6 +17,7 @@
 #include "taichi/ir/analysis.h"
 #include "taichi/ir/transforms.h"
 #include "taichi/codegen/codegen_utils.h"
+#include "llvm/Config/llvm-config.h"
 
 namespace taichi::lang {
 
@@ -65,19 +66,17 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {
                             const std::vector<llvm::Value *> &values) {
     auto stype = llvm::StructType::get(*llvm_context, types, false);
     auto value_arr = builder->CreateAlloca(stype);
+
     for (int i = 0; i < values.size(); i++) {
       auto value_ptr = builder->CreateGEP(
           stype, value_arr, {tlctx->get_constant(0), tlctx->get_constant(i)});
       builder->CreateStore(values[i], value_ptr);
     }
-    // === CHANGED SECTION: LLVM API CALL ===
-    // The call to `llvm::Type::getInt8PtrTy(*llvm_context)` was replaced with
-    // the modern `llvm::Type::getPointerTy()`, which returns an opaque pointer.
     return LLVMModuleBuilder::call(
         builder.get(), "vprintf",
         builder->CreateGlobalStringPtr(format, "format_string"),
-        builder->CreateBitCast(value_arr, llvm::PointerType::get(*llvm_context, 0)));
-    // === END OF CHANGED SECTION ===
+        builder->CreateBitCast(value_arr,
+                               llvm::PointerType::get(*llvm_context, 0)));
   }
 
   std::tuple<llvm::Value *, llvm::Type *> create_value_and_type(
@@ -344,6 +343,8 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {
     UNARY_STD(sgn)
     UNARY_STD(acos)
     UNARY_STD(asin)
+    UNARY_STD(erf)
+    UNARY_STD(erfc)
     else {
       TI_P(unary_op_type_name(op));
       TI_NOT_IMPLEMENTED
@@ -414,8 +415,23 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {
         !cuda_library_path.empty()) {
       /*
         Half2 optimization for float16 atomic add
-        ...
+
+        [CHI IR]
+            TensorType<2 x f16> old_val = atomic_add(TensorType<2 x f16>
+        dest_ptr*, TensorType<2 x f16> val)
+
+        [CodeGen]
+            old_val_ptr = Alloca(TensorType<2 x f16>)
+
+            val_ptr = Alloca(TensorType<2 x f16>)
+            GEP(val_ptr, 0) = ExtractValue(val, 0)
+            GEP(val_ptr, 1) = ExtractValue(val, 1)
+
+            half2_atomic_add(dest_ptr, old_val_ptr, val_ptr)
+
+            old_val = Load(old_val_ptr)
       */
+      // Allocate old_val_ptr to store the result of atomic_add
       auto char_type = llvm::Type::getInt8Ty(*tlctx->get_this_thread_context());
       auto half_type = llvm::Type::getHalfTy(*tlctx->get_this_thread_context());
       auto ptr_type = llvm::PointerType::get(char_type, 0);
@@ -591,30 +607,23 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {
     return true;  // on CUDA, pass the argument by value
   }
 
-  // === CHANGED SECTION: LLVM API CALL ===
-  llvm::Value *create_intrinsic_load(llvm::Value *ptr,
-                                     llvm::Type *ty) override {
-    // Issue an "__ldg" instruction to cache data in the read-only data cache.
-    // The `nvvm_ldg_global_*` intrinsics have been renamed to `nvvm_ldu_global_*`
-    // (load uniform).
-    auto intrin = ty->isFloatingPointTy() ? llvm::Intrinsic::nvvm_ldu_global_f
-                                          : llvm::Intrinsic::nvvm_ldu_global_i;
-    // Special treatment for bool types. As nvvm_ldg_global_i does not support
-    // 1-bit integer, so we convert them to i8.
-    if (ty->getScalarSizeInBits() == 1) {
-      auto *new_ty = tlctx->get_data_type<uint8>();
-      auto *new_ptr =
-          builder->CreatePointerCast(ptr, llvm::PointerType::get(new_ty, 0));
-      auto *v = builder->CreateIntrinsic(
-          intrin, {new_ty, llvm::PointerType::get(new_ty, 0)},
-          {new_ptr, tlctx->get_constant(new_ty->getScalarSizeInBits())});
-      return builder->CreateIsNotNull(v);
+  llvm::Value *TaskCodeGenCUDA::create_intrinsic_load(llvm::Value *ptr,
+                                                    llvm::Type *ty) {
+  #if LLVM_VERSION_MAJOR >= 20
+      // ldg intrinsics removed – use normal load from AS(1) + invariant metadata
+      auto *load = builder->CreateLoad(ty, ptr);          // ld.global.ca
+      load->setMetadata(llvm::LLVMContext::MD_invariant_load,
+                        llvm::MDNode::get(*llvm_context, {}));
+      return load;
+  #else
+      auto intrin = ty->isFloatingPointTy()
+                        ? llvm::Intrinsic::nvvm_ldg_global_f
+                        : llvm::Intrinsic::nvvm_ldg_global_i;
+      return builder->CreateIntrinsic(intrin,
+              {ty, llvm::PointerType::get(ty, 0)},
+              {ptr, tlctx->get_constant(ty->getScalarSizeInBits())});
+  #endif
     }
-    return builder->CreateIntrinsic(
-        intrin, {ty, llvm::PointerType::get(ty, 0)},
-        {ptr, tlctx->get_constant(ty->getScalarSizeInBits())});
-  }
-  // === END OF CHANGED SECTION ===
 
   void visit(GlobalLoadStmt *stmt) override {
     if (auto get_ch = stmt->src->cast<GetChStmt>()) {

From d20c823da84b92b0355c30272a8596010b7047a3 Mon Sep 17 00:00:00 2001
From: Tiffany Mintz <Tiffany.Mintz@amd.com>
Date: Thu, 28 Aug 2025 16:04:21 -0500
Subject: [PATCH 23/42] additional updates for llvm 20

---
 .github/workflows/scripts/ti_build/llvm.py |   2 +-
 python/taichi/CHANGELOG.md                 |  90 +++++--------
 taichi/codegen/cpu/codegen_cpu.cpp         | 139 ++++++++++++++-------
 taichi/common/core.h                       |  16 +--
 taichi/math/linalg.h                       |   5 +-
 taichi/runtime/llvm/llvm_context.cpp       |  36 ++----
 tests/cpp/llvm/llvm_offline_cache_test.cpp |   2 +-
 7 files changed, 145 insertions(+), 145 deletions(-)

diff --git a/.github/workflows/scripts/ti_build/llvm.py b/.github/workflows/scripts/ti_build/llvm.py
index 9619c11ba42b9..16cdc4a078f75 100644
--- a/.github/workflows/scripts/ti_build/llvm.py
+++ b/.github/workflows/scripts/ti_build/llvm.py
@@ -53,7 +53,7 @@ def setup_llvm() -> None:
         raise RuntimeError(f"Unsupported platform: {u.system} {u.machine}")
 
     # We should use LLVM toolchains shipped with OS.
-    # path_prepend('PATH', out / 'bin')
+     #path_prepend('PATH', out / 'bin')
     if (u.system, u.machine) not in (("Linux", "arm64"), ("Linux", "aarch64")):
         os.environ["LLVM_DIR"] = "/usr/lib/llvm-20/cmake"
         os.environ["CUDA_HOME"] = "/usr/local/cuda"
diff --git a/python/taichi/CHANGELOG.md b/python/taichi/CHANGELOG.md
index ae49964ed6633..6c2e5d41852d7 100644
--- a/python/taichi/CHANGELOG.md
+++ b/python/taichi/CHANGELOG.md
@@ -1,73 +1,39 @@
 Highlights:
    - **Build system**
-      - Use brew clang compiler on mac (#8704) (by **Hugh Perkins**)
-      - Update VMA and Vulkan-Headers to support Vulkan 1.3 (#8680) (by **Antonio Ferreras**)
-   - **Examples**
-      - Factorize cpp examples into separate executables (#8709) (by **Hugh Perkins**)
-   - **GUI**
-      - Support ndarray input (#8661) (by **白定**)
-   - **IR optimization passes**
-      - Remove redundant cfg optimization, to fix struct vec crash bug (#8691) (by **Hugh Perkins**)
-   - **Unified Device API**
-      - Fix the Unified Allocator to no longer return first two allocations as dupes (#8705) (by **Hugh Perkins**)
+      - Revert "Drop OpenGL build (#8751)" (#8753) (by **Proton**)
+      - Drop OpenGL build (#8751) (by **Proton**)
 
 Full changelog:
-   - Update header includes and fix configuration access in JIT compilation for CUDA (by **johnnynunez**)
-   - Fix header includes and update LLVM API calls in CPU code generation (by **johnnynunez**)
-   - Update CUDA header includes and fix call signatures in JIT compilation (by **johnnynunez**)
-   - Fix header includes and update configuration access in JIT compilation for CUDA (by **johnnynunez**)
+   - additional cuda updates for llvm20; merging from 8ca16de9a24e82baaed1deac80f71bd541c131ca to add2df35782768f5ad243cacef0922ec00890ba5 from johnnynunez/taichi master branch (by **Tiffany Mintz**)
+   - removing blackwell updates; restoring window_base.cpp include (by **Tiffany Mintz**)
+   - removing updates for blackwell (by **Tiffany Mintz**)
+   - implementing error function and cuda updates; merging 5449f722e796fac22d3b7a041488d77ac5e49b25 to 649c58d7a7991080e5dae510be72f7c6ad528001 from johnnynunez/taichi master branch; some of the changes from these were captured in the previous commit to rocm/taichi (by **Tiffany Mintz**)
+   - cmake build updates, header fixes; Merging from commits ebdc72be75443d8785703e0c8af1236be237fc04 to 9d140238ed1fa1f33b077a5fe74d61905b0167d4 from johnnynunez/taichi master branch (by **Tiffany Mintz**)
    - Fix header include for program in codegen_cpu.cpp (by **johnnynunez**)
    - Update header includes and fix LLVM API calls in CPU code generation (by **johnnynunez**)
-   - Fix header include for PassBuilder in jit_cuda.cpp (by **johnnynunez**)
    - Refactor JIT compilation in CUDA: update function pointers, enhance PTX handling, and implement new pass manager setup (by **johnnynunez**)
    - Fix include directive for IR analysis header in codegen_cuda.cpp (by **johnnynunez**)
-   - Update Vulkan setup to check for x86_64 architecture on Linux (by **johnnynunez**)
    - Add CHANGELOG.md to document recent updates and improvements (by **johnnynunez**)
    - Update LLVM API calls in codegen_cuda.cpp for compatibility with recent changes (by **johnnynunez**)
    - LLVM-20 (by **johnnynunez**)
-   - Refactor CMakeLists.txt and improve IR dumping functionality in various files (by **johnnynunez**)
-   - Remove unnecessary print statement from IRNode handling (by **johnnynunez**)
-   - Refactor ASTBuilder constructor declaration for improved clarity (by **johnnynunez**)
-   - Add constructor for ASTBuilder to initialize state (by **johnnynunez**)
-   - Add IR dumping and loading functionality in LLVM codegen (by **johnnynunez**)
-   - Update llvm.py (by **Johnny**)
-   - Update check_clang_tidy.sh (by **Johnny**)
-   - Update llvm.py (by **Johnny**)
-   - Update llvm.py (by **Johnny**)
-   - Update llvm.py (by **Johnny**)
-   - Update compiler.py (by **Johnny**)
-   - fix (by **Johnny**)
-   - fix (by **Johnny**)
-   - Merge branch 'taichi-dev:master' into master (by **Johnny**)
-   - [build] Add support for extracting .tar.xz archives in dep.py (#8721) (by **Johnny**)
-   - Vulkan (#9) (by **Johnny**)
-   - [update] Update hash values for various dependencies (#1) (by **Johnny**)
-   - Vulkan (#2) (by **Johnny**)
-   - [fix] Update platform detection for manylinux support on ARM architectures (#3) (by **Johnny**)
-   - [fix] Update sccache download URLs to version 0.10.0 for various platforms (#4) (by **Johnny**)
-   - [build] Replace libtinfo-dev with libncurses-dev in package dependencies (#5) (by **Johnny**)
-   - [fix] Update setup function to use Miniforge and correct download URLs (#6) (by **Johnny**)
-   - [llvm] Enhance LLVM installation logic for specific Linux architectures (#7) (by **Johnny**)
-   - [feature] Add support for extracting .tar.xz archives in dep.py (#8) (by **Johnny**)
-   - [Rhi] [bug] Fix the Unified Allocator to no longer return first two allocations as dupes (#8705) (by **Hugh Perkins**)
-   - [Build] Use brew clang compiler on mac (#8704) (by **Hugh Perkins**)
-   - [Example] Factorize cpp examples into separate executables (#8709) (by **Hugh Perkins**)
-   - [lang] Add SNode.snode_tree_id (#8697) (by **Hugh Perkins**)
-   - [misc] Bug report template instructions no longer render in final bug report (#8699) (by **Hugh Perkins**)
-   - [Opt] Remove redundant cfg optimization, to fix struct vec crash bug (#8691) (by **Hugh Perkins**)
-   - [ci] Do not setup Vulkan SDK (#8692) (by **Proton**)
-   - [misc] Fix cannot name an alias template, fixes #8683 (by **Hugh Perkins**)
-   - [build] [mac] Fix mac build symbol export failure (#8690) (by **Hugh Perkins**)
-   - [ci] Switch git proxy (#8681) (by **Proton**)
-   - [Build] Update VMA and Vulkan-Headers to support Vulkan 1.3 (#8680) (by **Antonio Ferreras**)
-   - [GUI] Support ndarray input (#8661) (by **白定**)
-   - [ci] Bump minimum python version to 3.9 to align with pytorch (#8676) (by **Bob Cao**)
-   - [vulkan] Let VMA grab fptrs itself (#8672) (by **Bob Cao**)
-   - [ci] Fix pre-commit rants, disable deprecated GitHub integration (#8671) (by **Proton**)
-   - [amdgpu] Calculate mcpu_ and compute_capability_ properly and with ROCm 6 compat (#8667) (by **Gavin Zhao**)
-   - [ci] Disable paddle tests, limit cmake version (#8670) (by **Proton**)
-   - [ci] Chasing migrated bot master (#8657) (by **Proton**)
-   - [doc] Fix typo in README.md (#8612) (by **Ikko Eltociear Ashimine**)
-   - [ci] Fixing releasing (#8651) (by **Proton**)
-   - [ci] Upgrade deprecated (up|down)load-artifacts@v3 actions to v4 (#8645) (by **Proton**)
-   - [Sparse Matrix] Fix dimension mismatch error in sparse matrix multiplication (#8644) (by **pengyu**)
+   - Merge branch 'amd-integration' into amd-develop (by **tmm77**)
+   - Merge pull request #4 from ROCm/master (by **tmm77**)
+   - Merge branch 'amd-develop' into master (by **tmm77**)
+   - Merge pull request #3 from taichi-dev/master (by **tmm77**)
+   - [ir] Sanitize SPIRV debugprintf string (#8752) (by **Proton**)
+   - [ci] Windows needs Vulkan SDK (#8754) (by **Proton**)
+   - [Build] Revert "Drop OpenGL build (#8751)" (#8753) (by **Proton**)
+   - [Build] Drop OpenGL build (#8751) (by **Proton**)
+   - [build] Update minimum requirement for CMAKE (#8703) (by **Linull/李林**)
+   - [build] Add Python 3.13 build (#8748) (by **Proton**)
+   - [build] Identify aarch64 wheels (#8716) (by **Johnny**)
+   - [build] Update setup function to use Miniforge and correct download URLs (#8719) (by **Johnny**)
+   - [build] Update Vulkan SDK version (#8749) (by **T.Yamada**)
+   - [build] Update sccache download URLs to version 0.10.0 for various plat… (#8717) (by **Johnny**)
+   - [build] Replace libtinfo-dev with libncurses-dev in package dependencies (#8718) (by **Johnny**)
+   - [ci] Migrate to new threading interface (#8731) (by **Emmanuel Ferdman**)
+   - ROCm port of taichi (by **Bhanu Kiran Atturu**)
+   - setting architecture to gpu (by **tmm77**)
+   - Taichi Multistage Dockerfile (by **Bhavesh Lad**)
+   - Taicho Multistage Dockerfile (by **Bhavesh Lad**)
+   - fix: Patch to avoid the need to fetch source to build Taichi wheel (by **Bhavesh Lad**)
diff --git a/taichi/codegen/cpu/codegen_cpu.cpp b/taichi/codegen/cpu/codegen_cpu.cpp
index 495a0d512bd8f..74e3a0438abc2 100644
--- a/taichi/codegen/cpu/codegen_cpu.cpp
+++ b/taichi/codegen/cpu/codegen_cpu.cpp
@@ -12,18 +12,17 @@
 #include "taichi/ir/analysis.h"
 #include "taichi/analysis/offline_cache_util.h"
 
-#include "llvm/TargetParser/Host.h"
+#if LLVM_VERSION_MAJOR >= 16
+#include <llvm/TargetParser/Host.h>
+#else
+#include <llvm/Support/Host.h>
+#endif
+
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Transforms/IPO.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
 #include "llvm/Passes/PassBuilder.h"
-#include "llvm/Passes/StandardInstrumentations.h"
-#include "llvm/Transforms/Scalar/IndVarSimplify.h"
-#include "llvm/Transforms/Scalar/LoopStrengthReduce.h"
-#include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h"
-#include "llvm/Transforms/Scalar/EarlyCSE.h"
-#include "llvm/Transforms/IPO/FunctionAttrs.h"
-#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
 
 namespace taichi::lang {
 
@@ -44,24 +43,36 @@ class TaskCodeGenCPU : public TaskCodeGenLLVM {
 
   void create_offload_range_for(OffloadedStmt *stmt) override {
     int step = 1;
+
+    // In parallel for-loops reversing the order doesn't make sense.
+    // However, we may need to support serial offloaded range for's in the
+    // future, so it still makes sense to reverse the order here.
     if (stmt->reversed) {
       step = -1;
     }
+
     auto *tls_prologue = create_xlogue(stmt->tls_prologue);
+
+    // The loop body
     llvm::Function *body;
     {
       auto guard = get_function_creation_guard(
           {llvm::PointerType::get(get_runtime_type("RuntimeContext"), 0),
            llvm::PointerType::get(*llvm_context, 0),
            tlctx->get_data_type<int>()});
+
       auto loop_var = create_entry_block_alloca(PrimitiveType::i32);
       loop_vars_llvm[stmt].push_back(loop_var);
       builder->CreateStore(get_arg(2), loop_var);
       stmt->body->accept(this);
+
       body = guard.body;
     }
+
     llvm::Value *epilogue = create_xlogue(stmt->tls_epilogue);
+
     auto [begin, end] = get_range_for_bounds(stmt);
+
     call("cpu_parallel_range_for", get_arg(0),
          tlctx->get_constant(stmt->num_cpu_threads), begin, end,
          tlctx->get_constant(step), tlctx->get_constant(stmt->block_dim),
@@ -70,19 +81,23 @@ class TaskCodeGenCPU : public TaskCodeGenLLVM {
 
   void create_offload_mesh_for(OffloadedStmt *stmt) override {
     auto *tls_prologue = create_mesh_xlogue(stmt->tls_prologue);
+
     llvm::Function *body;
     {
       auto guard = get_function_creation_guard(
           {llvm::PointerType::get(get_runtime_type("RuntimeContext"), 0),
            llvm::PointerType::get(*llvm_context, 0),
            tlctx->get_data_type<int>()});
+
       for (int i = 0; i < stmt->mesh_prologue->size(); i++) {
         auto &s = stmt->mesh_prologue->statements[i];
         s->accept(this);
       }
+
       if (stmt->bls_prologue) {
         stmt->bls_prologue->accept(this);
       }
+
       auto loop_test_bb =
           llvm::BasicBlock::Create(*llvm_context, "loop_test", func);
       auto loop_body_bb =
@@ -93,6 +108,7 @@ class TaskCodeGenCPU : public TaskCodeGenLLVM {
           create_entry_block_alloca(llvm::Type::getInt32Ty(*llvm_context));
       builder->CreateStore(tlctx->get_constant(0), loop_index);
       builder->CreateBr(loop_test_bb);
+
       {
         builder->SetInsertPoint(loop_test_bb);
         auto *loop_index_load =
@@ -103,6 +119,7 @@ class TaskCodeGenCPU : public TaskCodeGenLLVM {
                          ->second]);
         builder->CreateCondBr(cond, loop_body_bb, func_exit);
       }
+
       {
         builder->SetInsertPoint(loop_body_bb);
         loop_vars_llvm[stmt].push_back(loop_index);
@@ -118,12 +135,16 @@ class TaskCodeGenCPU : public TaskCodeGenLLVM {
         builder->CreateBr(loop_test_bb);
         builder->SetInsertPoint(func_exit);
       }
+
       if (stmt->bls_epilogue) {
         stmt->bls_epilogue->accept(this);
       }
+
       body = guard.body;
     }
+
     llvm::Value *epilogue = create_mesh_xlogue(stmt->tls_epilogue);
+
     call("cpu_parallel_mesh_for", get_arg(0),
          tlctx->get_constant(stmt->num_cpu_threads),
          tlctx->get_constant(stmt->mesh->num_patches),
@@ -137,6 +158,12 @@ class TaskCodeGenCPU : public TaskCodeGenLLVM {
     bls_buffer = new llvm::GlobalVariable(
         *module, type, false, llvm::GlobalValue::ExternalLinkage, nullptr,
         "bls_buffer", nullptr, llvm::GlobalVariable::LocalExecTLSModel, 0);
+    /* module->getOrInsertGlobal("bls_buffer", type);
+    bls_buffer = module->getNamedGlobal("bls_buffer");
+    bls_buffer->setAlignment(llvm::MaybeAlign(8));*/ //
+
+    // initialize the variable with an undef value to ensure it is added to the
+    // symbol table
     bls_buffer->setInitializer(llvm::UndefValue::get(type));
   }
 
@@ -204,7 +231,6 @@ static llvm::Triple get_host_target_triple() {
   }
   return expected_jtmb->getTargetTriple();
 }
-
 }  // namespace
 
 #ifdef TI_WITH_LLVM
@@ -219,11 +245,9 @@ LLVMCompiledTask KernelCodeGenCPU::compile_task(
 }
 
 void KernelCodeGenCPU::optimize_module(llvm::Module *module) {
-  TI_AUTO_PROF;
-
+  TI_AUTO_PROF
   const auto &compile_config = get_compile_config();
   auto triple = get_host_target_triple();
-  module->setTargetTriple(triple.str());
 
   std::string err_str;
   const llvm::Target *target =
@@ -233,31 +257,50 @@ void KernelCodeGenCPU::optimize_module(llvm::Module *module) {
   llvm::TargetOptions options;
   if (compile_config.fast_math) {
     options.AllowFPOpFusion = llvm::FPOpFusion::Fast;
-    options.UnsafeFPMath = true;
-    options.NoInfsFPMath = true;
-    options.NoNaNsFPMath = true;
+    options.UnsafeFPMath = 1;
+    options.NoInfsFPMath = 1;
+    options.NoNaNsFPMath = 1;
+  } else {
+    options.AllowFPOpFusion = llvm::FPOpFusion::Strict;
+    options.UnsafeFPMath = 0;
+    options.NoInfsFPMath = 0;
+    options.NoNaNsFPMath = 0;
   }
+
   options.HonorSignDependentRoundingFPMathOption = false;
   options.NoZerosInBSS = false;
   options.GuaranteedTailCallOpt = false;
 
+#if LLVM_VERSION_MAJOR >= 18
+  const auto opt_level = llvm::CodeGenOptLevel::Aggressive;
+#else
+  const auto opt_level = llvm::CodeGenOpt::Aggressive;
+#endif
   llvm::StringRef mcpu = llvm::sys::getHostCPUName();
-  // FIX: Use llvm::CodeGenOptLevel
   std::unique_ptr<llvm::TargetMachine> target_machine(
       target->createTargetMachine(triple.str(), mcpu.str(), "", options,
                                   llvm::Reloc::PIC_, llvm::CodeModel::Small,
-                                  compile_config.opt_level > 0 ? llvm::CodeGenOptLevel::Default : llvm::CodeGenOptLevel::None));
+                                  opt_level));
+  TI_ERROR_UNLESS(target_machine.get(), "Could not allocate target machine!");
 
-  TI_ERROR_UNLESS(target_machine, "Could not allocate target machine!");
+  module->setTargetTriple(triple.str());
   module->setDataLayout(target_machine->createDataLayout());
 
+  // Create the new analysis manager
   llvm::LoopAnalysisManager LAM;
   llvm::FunctionAnalysisManager FAM;
   llvm::CGSCCAnalysisManager CGAM;
   llvm::ModuleAnalysisManager MAM;
-  llvm::PassBuilder PB(target_machine.get());
 
-  FAM.registerPass([&] { return target_machine->getTargetIRAnalysis(); });
+  // Create the new pass builder
+  llvm::PipelineTuningOptions PTO;
+  PTO.LoopInterleaving = true;
+  PTO.LoopVectorization = true;
+  PTO.SLPVectorization = true;
+  PTO.LoopUnrolling = true;
+  PTO.ForgetAllSCEVInLoopUnroll = true;
+
+  llvm::PassBuilder PB(target_machine.get(), PTO);
 
   PB.registerModuleAnalyses(MAM);
   PB.registerCGSCCAnalyses(CGAM);
@@ -265,40 +308,48 @@ void KernelCodeGenCPU::optimize_module(llvm::Module *module) {
   PB.registerLoopAnalyses(LAM);
   PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
 
-  llvm::OptimizationLevel opt_level = llvm::OptimizationLevel::O3;
-
-  llvm::ModulePassManager MPM;
-  if (compile_config.opt_level > 0) {
-    MPM = PB.buildPerModuleDefaultPipeline(opt_level);
-  }
-
-  llvm::FunctionPassManager FPM;
-  FPM.addPass(llvm::createFunctionToLoopPassAdaptor(llvm::LoopStrengthReducePass()));
-  FPM.addPass(llvm::createFunctionToLoopPassAdaptor(llvm::IndVarSimplifyPass()));
-  FPM.addPass(llvm::SeparateConstOffsetFromGEPPass(false));
-  FPM.addPass(llvm::EarlyCSEPass(true));
-  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+  target_machine->registerPassBuilderCallbacks(PB);
 
-  llvm::SmallString<0> asm_buffer;
-  if (compile_config.print_kernel_asm) {
-    llvm::raw_svector_ostream asm_stream(asm_buffer);
-    // FIX: Pass the stream by reference, not by pointer
-    if (target_machine->addPassesToEmitFile(
-            MPM, asm_stream, nullptr, llvm::CodeGenFileType::AssemblyFile)) {
-      TI_ERROR("Failed to addPassesToEmitFile");
-    }
-  }
+  llvm::ModulePassManager MPM =
+      PB.buildPerModuleDefaultPipeline(llvm::OptimizationLevel::O3);
 
   {
     TI_PROFILER("llvm_module_pass");
     MPM.run(*module, MAM);
   }
 
+  if (llvm::verifyModule(*module, &llvm::errs())) {
+    module->print(llvm::errs(), nullptr);
+    TI_ERROR("LLVM Module broken");
+  }
+
   if (compile_config.print_kernel_asm) {
+    llvm::SmallString<8> outstr;
+    llvm::raw_svector_ostream ostream(outstr);
+    ostream.SetUnbuffered();
+
+    llvm::legacy::PassManager LPM;
+    LPM.add(llvm::createTargetTransformInfoWrapperPass(
+        target_machine->getTargetIRAnalysis()));
+
+    // Override default to generate verbose assembly.
+    target_machine->Options.MCOptions.AsmVerbose = true;
+
+#if LLVM_VERSION_MAJOR >= 18
+    const auto file_type = llvm::CodeGenFileType::AssemblyFile;
+#else
+    const auto file_type = llvm::CGFT_AssemblyFile;
+#endif
+    bool fail =
+        target_machine->addPassesToEmitFile(LPM, ostream, nullptr, file_type);
+    TI_ERROR_IF(fail, "Failed to setup the CPU assembly writer");
+    LPM.run(*module);
+
     static FileSequenceWriter writer(
         "taichi_kernel_cpu_llvm_ir_optimized_asm_{:04d}.s",
         "optimized assembly code (CPU)");
-    writer.write(std::string(asm_buffer.str()));
+    std::string buffer(outstr.begin(), outstr.end());
+    writer.write(buffer);
   }
 
   if (compile_config.print_kernel_llvm_ir_optimized) {
@@ -306,12 +357,12 @@ void KernelCodeGenCPU::optimize_module(llvm::Module *module) {
       TI_INFO("Functions with > 100 instructions in optimized LLVM IR:");
       TaichiLLVMContext::print_huge_functions(module);
     }
+
     static FileSequenceWriter writer(
         "taichi_kernel_cpu_llvm_ir_optimized_{:04d}.ll",
         "optimized LLVM IR (CPU)");
     writer.write(module);
   }
 }
-
 #endif  // TI_WITH_LLVM
 }  // namespace taichi::lang
\ No newline at end of file
diff --git a/taichi/common/core.h b/taichi/common/core.h
index 0f08bb594548a..e704943069672 100644
--- a/taichi/common/core.h
+++ b/taichi/common/core.h
@@ -170,35 +170,35 @@ using real = float32;
 real constexpr operator""_f(long double v) {
   return real(v);
 }
-real constexpr operator"" _f(unsigned long long v) {
+real constexpr operator""_f(unsigned long long v) {
   return real(v);
 }
 
 float32 constexpr operator""_f32(long double v) {
   return float32(v);
 }
-float32 constexpr operator"" _f32(unsigned long long v) {
+float32 constexpr operator""_f32(unsigned long long v) {
   return float32(v);
 }
 
-float32 constexpr operator"" _fs(long double v) {
+float32 constexpr operator""_fs(long double v) {
   return float32(v);
 }
-float32 constexpr operator"" _fs(unsigned long long v) {
+float32 constexpr operator""_fs(unsigned long long v) {
   return float32(v);
 }
 
-float64 constexpr operator"" _f64(long double v) {
+float64 constexpr operator""_f64(long double v) {
   return float64(v);
 }
-float64 constexpr operator"" _f64(unsigned long long v) {
+float64 constexpr operator""_f64(unsigned long long v) {
   return float64(v);
 }
 
-float64 constexpr operator"" _fd(long double v) {
+float64 constexpr operator""_fd(long double v) {
   return float64(v);
 }
-float64 constexpr operator"" _fd(unsigned long long v) {
+float64 constexpr operator""_fd(unsigned long long v) {
   return float64(v);
 }
 
diff --git a/taichi/math/linalg.h b/taichi/math/linalg.h
index 0f5ee6fd053c2..628572ac4976c 100644
--- a/taichi/math/linalg.h
+++ b/taichi/math/linalg.h
@@ -242,9 +242,8 @@ struct VectorND : public VectorNDBase<dim__, T, ISE> {
   }
 
   TI_FORCE_INLINE VectorND &operator=(const VectorND &o) {
-    // memcpy(this, &o, sizeof(*this));
-    for (int i = 0; i < dim; ++i) {
-      entries[i] = o[i];
+    for (int i = 0; i < dim; i++) {
+      this->d[i] = o.d[i];
     }
     return *this;
   }
diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp
index 2687b223275e7..5e206f7c50e2b 100644
--- a/taichi/runtime/llvm/llvm_context.cpp
+++ b/taichi/runtime/llvm/llvm_context.cpp
@@ -39,8 +39,6 @@
 #include "llvm/Linker/Linker.h"
 #include "llvm/Demangle/Demangle.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
-#include "llvm/Transforms/IPO/StripDeadPrototypes.h"
-#include "llvm/Passes/PassBuilder.h"
 
 #include "taichi/util/lang_util.h"
 #include "taichi/jit/jit_session.h"
@@ -323,16 +321,6 @@ static void remove_useless_cuda_libdevice_functions(llvm::Module *module) {
       "lgammaf",
       "tgamma",
       "lgamma",
-      "erff",
-      "erfinvf",
-      "erfcf",
-      "erfcxf",
-      "erfcinvf",
-      "erf",
-      "erfinv",
-      "erfcx",
-      "erfcinv",
-      "erfc",
   };
   for (auto fn : function_name_list) {
     module->getFunction("__nv_" + fn)->eraseFromParent();
@@ -341,21 +329,17 @@ static void remove_useless_cuda_libdevice_functions(llvm::Module *module) {
 }
 
 // Note: runtime_module = init_module < struct_module
+std::unique_ptr<llvm::Module> TaichiLLVMContext::clone_runtime_module() {
+  TI_AUTO_PROF
+  auto *mod = get_this_thread_runtime_module();
 
-std::unique_ptr<llvm::Module> TaichiLLVMContext::clone_runtime_module(
-    const std::string &name) {
-  TI_AUTO_PROF;
-  auto cloned = llvm::CloneModule(*runtime_module_.get());
-  cloned->setModuleIdentifier(name);
-
-  llvm::PassBuilder PB;
-  llvm::ModuleAnalysisManager MAM;
-  PB.registerModuleAnalyses(MAM);
-
-  llvm::ModulePassManager MPM;
-  MPM.addPass(llvm::StripDeadPrototypesPass());
+  std::unique_ptr<llvm::Module> cloned;
+  {
+    TI_PROFILER("clone module");
+    cloned = llvm::CloneModule(*mod);
+  }
 
-  MPM.run(*cloned, MAM);
+  TI_ASSERT(cloned != nullptr);
 
   return cloned;
 }
@@ -1171,4 +1155,4 @@ TaichiLLVMContext::get_struct_type_with_data_layout(const StructType *old_ty,
 
 TI_REGISTER_TASK(make_slim_libdevice);
 
-}  // namespace taichi::lang
+}  // namespace taichi::lang
\ No newline at end of file
diff --git a/tests/cpp/llvm/llvm_offline_cache_test.cpp b/tests/cpp/llvm/llvm_offline_cache_test.cpp
index 0cdc387f9488b..b891976af3faa 100644
--- a/tests/cpp/llvm/llvm_offline_cache_test.cpp
+++ b/tests/cpp/llvm/llvm_offline_cache_test.cpp
@@ -9,7 +9,7 @@
 
 namespace fs = std::filesystem;
 
-#include "llvm/ADT/Triple.h"
+#include "llvm/TargetParser/Triple.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"

From f0ca790c58b56ee0c3d079703faa89d9cc45a982 Mon Sep 17 00:00:00 2001
From: Tiffany Mintz <tiffany.mintz@amd.com>
Date: Tue, 2 Sep 2025 17:00:41 -0400
Subject: [PATCH 24/42] fix build issues with llvm 20 update

---
 python/taichi/CHANGELOG.md           |   1 +
 taichi/codegen/cuda/codegen_cuda.cpp |  44 ++--
 taichi/runtime/cuda/jit_cuda.cpp     | 303 +++++++++++++--------------
 taichi/runtime/cuda/jit_cuda.h       |   4 +-
 4 files changed, 179 insertions(+), 173 deletions(-)

diff --git a/python/taichi/CHANGELOG.md b/python/taichi/CHANGELOG.md
index 6c2e5d41852d7..b09dcfba95353 100644
--- a/python/taichi/CHANGELOG.md
+++ b/python/taichi/CHANGELOG.md
@@ -4,6 +4,7 @@ Highlights:
       - Drop OpenGL build (#8751) (by **Proton**)
 
 Full changelog:
+   - additional updates for llvm 20 (by **Tiffany Mintz**)
    - additional cuda updates for llvm20; merging from 8ca16de9a24e82baaed1deac80f71bd541c131ca to add2df35782768f5ad243cacef0922ec00890ba5 from johnnynunez/taichi master branch (by **Tiffany Mintz**)
    - removing blackwell updates; restoring window_base.cpp include (by **Tiffany Mintz**)
    - removing updates for blackwell (by **Tiffany Mintz**)
diff --git a/taichi/codegen/cuda/codegen_cuda.cpp b/taichi/codegen/cuda/codegen_cuda.cpp
index bdfc70e9ef342..ce931bbffe74a 100644
--- a/taichi/codegen/cuda/codegen_cuda.cpp
+++ b/taichi/codegen/cuda/codegen_cuda.cpp
@@ -17,7 +17,6 @@
 #include "taichi/ir/analysis.h"
 #include "taichi/ir/transforms.h"
 #include "taichi/codegen/codegen_utils.h"
-#include "llvm/Config/llvm-config.h"
 
 namespace taichi::lang {
 
@@ -607,23 +606,30 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {
     return true;  // on CUDA, pass the argument by value
   }
 
-  llvm::Value *TaskCodeGenCUDA::create_intrinsic_load(llvm::Value *ptr,
-                                                    llvm::Type *ty) {
-  #if LLVM_VERSION_MAJOR >= 20
-      // ldg intrinsics removed – use normal load from AS(1) + invariant metadata
-      auto *load = builder->CreateLoad(ty, ptr);          // ld.global.ca
-      load->setMetadata(llvm::LLVMContext::MD_invariant_load,
-                        llvm::MDNode::get(*llvm_context, {}));
-      return load;
-  #else
-      auto intrin = ty->isFloatingPointTy()
-                        ? llvm::Intrinsic::nvvm_ldg_global_f
-                        : llvm::Intrinsic::nvvm_ldg_global_i;
-      return builder->CreateIntrinsic(intrin,
-              {ty, llvm::PointerType::get(ty, 0)},
-              {ptr, tlctx->get_constant(ty->getScalarSizeInBits())});
-  #endif
-    }
+  llvm::Value *create_intrinsic_load(llvm::Value *ptr,
+                                     llvm::Type *ty) override {
+    // The llvm.nvvm.ldg.global.* intrinsics have been removed.
+    // They are replaced by a standard load from global address space 1
+    // with !invariant.load metadata.
+
+    // The address space for read-only cache loads is 1 (global).
+    llvm::PointerType *ptr_ty_addrspace_1 = llvm::PointerType::get(ty, 1);
+
+    // Cast the input pointer to the correct address space.
+    llvm::Value *cast_ptr =
+        builder->CreateAddrSpaceCast(ptr, ptr_ty_addrspace_1);
+
+    // Create the load instruction.
+    llvm::LoadInst *load = builder->CreateLoad(ty, cast_ptr);
+
+    // Attach the !invariant.load metadata.
+    llvm::MDNode *invariant_load_metadata =
+        llvm::MDNode::get(builder->getContext(), {});
+    load->setMetadata(llvm::LLVMContext::MD_invariant_load,
+                      invariant_load_metadata);
+
+    return load;
+  }
 
   void visit(GlobalLoadStmt *stmt) override {
     if (auto get_ch = stmt->src->cast<GetChStmt>()) {
@@ -786,4 +792,4 @@ LLVMCompiledTask KernelCodeGenCUDA::compile_task(
   return gen.run_compilation();
 }
 
-}  // namespace taichi::lang
\ No newline at end of file
+}  // namespace taichi::lang
diff --git a/taichi/runtime/cuda/jit_cuda.cpp b/taichi/runtime/cuda/jit_cuda.cpp
index 958d98bdbf58b..46860c562678c 100644
--- a/taichi/runtime/cuda/jit_cuda.cpp
+++ b/taichi/runtime/cuda/jit_cuda.cpp
@@ -1,64 +1,15 @@
+
 #include "taichi/runtime/cuda/jit_cuda.h"
 #include "taichi/runtime/llvm/llvm_context.h"
-#include "taichi/util/file_sequence_writer.h"
-#include "taichi/runtime/program_impls/llvm/llvm_program.h"
 
 #include "llvm/Passes/PassBuilder.h"
-#include "llvm/Transforms/Scalar/LoopStrengthReduce.h"
-#include "llvm/Transforms/Scalar/IndVarSimplify.h"
-#include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h"
-#include "llvm/Transforms/Scalar/EarlyCSE.h"
-#include "llvm/Target/TargetMachine.h"
-
-// This is the crucial include for CUDA driver types like CUjit_option
-#include <cuda.h>
-#include <string>
-#include <vector>
 
 namespace taichi::lang {
 
 #if defined(TI_WITH_CUDA)
 
-// Helper function to check if runtime_initialize exists
-bool module_has_runtime_initialize(
-    const llvm::Module::FunctionListType &function_list) {
-  for (const auto &func : function_list) {
-    if (func.getName() == "runtime_initialize") {
-      return true;
-    }
-  }
-  return false;
-}
-
-// Helper function to get a representative name for dumping files
-std::string moduleToDumpName(llvm::Module *M) {
-  std::string dumpName = M->getName().str();
-  if (M->getFunctionList().empty()) {
-    return dumpName;
-  }
-  if (!module_has_runtime_initialize(M->getFunctionList())) {
-    dumpName = M->getFunctionList().begin()->getName().str();
-  }
-  return dumpName;
-}
-
-JITModule *JITSessionCUDA::add_module(std::unique_ptr<llvm::Module> M,
-                                      int max_reg) {
-  const char *dump_ir_env = std::getenv("TAICHI_DUMP_IR");
-  if (dump_ir_env) {
-    const std::string dumpOutDir = "/tmp/ir/";
-    std::filesystem::create_directories(dumpOutDir);
-    std::string dumpName = moduleToDumpName(M.get());
-    std::string filename = dumpOutDir + "/" + dumpName + "_before_ptx.ll";
-    std::error_code EC;
-    llvm::raw_fd_ostream dest_file(filename, EC);
-    if (!EC) {
-      M->print(dest_file, nullptr);
-    } else {
-      TI_WARN("Failed to dump LLVM IR to file {}: {}", filename, EC.message());
-    }
-  }
-
+JITModule *JITSessionCUDA ::add_module(std::unique_ptr<llvm::Module> M,
+                                       int max_reg) {
   auto ptx = compile_module_to_ptx(M);
   if (this->config_.print_kernel_asm) {
     static FileSequenceWriter writer("taichi_kernel_nvptx_{:04d}.ptx",
@@ -66,59 +17,33 @@ JITModule *JITSessionCUDA::add_module(std::unique_ptr<llvm::Module> M,
     writer.write(ptx);
   }
 
-  if (dump_ir_env) {
-    const std::string dumpOutDir = "/tmp/ptx/";
-    std::filesystem::create_directories(dumpOutDir);
-    std::string dumpName = moduleToDumpName(M.get());
-    std::string filename = dumpOutDir + "/" + dumpName + ".ptx";
-    std::ofstream out_file(filename);
-    if (out_file.is_open()) {
-      out_file << ptx;
-      out_file.close();
-      TI_INFO("PTX dumped to: {}", filename);
-    }
-  }
-
-  const char *load_ptx_env = std::getenv("TAICHI_LOAD_PTX");
-  if (load_ptx_env) {
-    const std::string dumpOutDir = "/tmp/ptx/";
-    std::string dumpName = moduleToDumpName(M.get());
-    std::string filename = dumpOutDir + "/" + dumpName + ".ptx";
-    std::ifstream in_file(filename);
-    if (in_file.is_open()) {
-      TI_INFO("Loading PTX from file: {}", filename);
-      std::stringstream ptx_stream;
-      ptx_stream << in_file.rdbuf();
-      ptx = ptx_stream.str();
-      in_file.close();
-    } else {
-      TI_WARN("Failed to open PTX file for loading: {}", filename);
-    }
-  }
-  if (ptx.empty() || ptx.back() != '\0') {
-    ptx += '\0';
-  }
-
+  // TODO: figure out why using the guard leads to wrong tests results
+  // auto context_guard = CUDAContext::get_instance().get_guard();
   CUDAContext::get_instance().make_current();
+
+  // Create module for object
   void *cuda_module;
   TI_TRACE("PTX size: {:.2f}KB", ptx.size() / 1024.0);
   auto t = Time::get_time();
   TI_TRACE("Loading module...");
   [[maybe_unused]] auto _ = CUDAContext::get_instance().get_lock_guard();
 
-  std::vector<CUjit_option> options;
-  std::vector<void *> option_values;
-  unsigned int max_reg_uint = max_reg;
+  constexpr int max_num_options = 8;
+  int num_options = 0;
+  uint32 options[max_num_options];
+  void *option_values[max_num_options];
 
-  if (max_reg > 0) {
-    options.push_back(CU_JIT_MAX_REGISTERS);
-    option_values.push_back(reinterpret_cast<void *>(&max_reg_uint));
+  // Insert options
+  if (max_reg != 0) {
+    options[num_options] = CU_JIT_MAX_REGISTERS;
+    option_values[num_options] = &max_reg;
+    num_options++;
   }
 
-  CUDADriver::get_instance().module_load_data_ex(&cuda_module, ptx.c_str(),
-                                                 options.size(),
-                                                 options.data(),
-                                                 option_values.data());
+  TI_ASSERT(num_options <= max_num_options);
+
+  CUDADriver::get_instance().module_load_data_ex(
+      &cuda_module, ptx.c_str(), num_options, options, option_values);
   TI_TRACE("CUDA module load time : {}ms", (Time::get_time() - t) * 1000);
 
   modules.push_back(std::make_unique<JITModuleCUDA>(cuda_module));
@@ -126,27 +51,38 @@ JITModule *JITSessionCUDA::add_module(std::unique_ptr<llvm::Module> M,
 }
 
 std::string cuda_mattrs() {
-  return "+ptx" +
-         std::to_string(CUDAContext::get_instance().get_mcpu_version());
+  // TODO: upgrade to ptx78 as supported by LLVM 16
+  return "+ptx63";
 }
 
-std::string convert_name_for_ptx(std::string new_name) {
-  for (char &i : new_name) {
-    if (i == '@' || i == '?' || i == '$' || i == '<' || i == '>' ||
-        (!std::isalnum(i) && i != '_' && i != '.')) {
-      i = '_';
+std::string convert(std::string new_name) {
+  // Evil C++ mangling on Windows will lead to "unsupported characters in
+  // symbol" error in LLVM PTX printer. Convert here.
+  for (int i = 0; i < (int)new_name.size(); i++) {
+    if (new_name[i] == '@') {
+      new_name.replace(i, 1, "_at_");
+    } else if (new_name[i] == '?') {
+      new_name.replace(i, 1, "_qm_");
+    } else if (new_name[i] == '$') {
+      new_name.replace(i, 1, "_dl_");
+    } else if (new_name[i] == '<') {
+      new_name.replace(i, 1, "_lb_");
+    } else if (new_name[i] == '>') {
+      new_name.replace(i, 1, "_rb_");
+    } else if (!std::isalpha(new_name[i]) && !std::isdigit(new_name[i]) &&
+               new_name[i] != '_' && new_name[i] != '.') {
+      new_name.replace(i, 1, "_xx_");
     }
   }
-  if (!new_name.empty() && !isalpha(new_name[0]) && new_name[0] != '_' &&
-      new_name[0] != '.') {
-    new_name = "_" + new_name;
-  }
+  if (!new_name.empty())
+    TI_ASSERT(isalpha(new_name[0]) || new_name[0] == '_' || new_name[0] == '.');
   return new_name;
 }
 
 std::string JITSessionCUDA::compile_module_to_ptx(
     std::unique_ptr<llvm::Module> &module) {
   TI_AUTO_PROF
+  // Part of this function is borrowed from Halide::CodeGen_PTX_Dev.cpp
   if (llvm::verifyModule(*module, &llvm::errs())) {
     module->print(llvm::errs(), nullptr);
     TI_ERROR("LLVM Module broken");
@@ -160,14 +96,14 @@ std::string JITSessionCUDA::compile_module_to_ptx(
     writer.write(module.get());
   }
 
-  for (auto &f : module->globals()) {
-    f.setName(convert_name_for_ptx(f.getName().str()));
-  }
-  for (auto &f : *module) {
-    f.setName(convert_name_for_ptx(f.getName().str()));
-  }
+  for (auto &f : module->globals())
+    f.setName(convert(f.getName().str()));
+  for (auto &f : *module)
+    f.setName(convert(f.getName().str()));
 
   llvm::Triple triple(module->getTargetTriple());
+
+  // Allocate target machine
   std::string err_str;
   const llvm::Target *target =
       TargetRegistry::lookupTarget(triple.str(), err_str);
@@ -176,40 +112,85 @@ std::string JITSessionCUDA::compile_module_to_ptx(
   TargetOptions options;
   if (this->config_.fast_math) {
     options.AllowFPOpFusion = FPOpFusion::Fast;
-    options.UnsafeFPMath = true;
-    options.NoInfsFPMath = true;
-    options.NoNaNsFPMath = true;
+    // See NVPTXISelLowering.cpp
+    // Setting UnsafeFPMath true will result in approximations such as
+    // sqrt.approx in PTX for both f32 and f64
+    options.UnsafeFPMath = 1;
+    options.NoInfsFPMath = 1;
+    options.NoNaNsFPMath = 1;
+  } else {
+    options.AllowFPOpFusion = FPOpFusion::Strict;
+    options.UnsafeFPMath = 0;
+    options.NoInfsFPMath = 0;
+    options.NoNaNsFPMath = 0;
   }
-  options.HonorSignDependentRoundingFPMathOption = false;
-  options.NoZerosInBSS = false;
-  options.GuaranteedTailCallOpt = false;
 
+  options.HonorSignDependentRoundingFPMathOption = 0;
+  options.NoZerosInBSS = 0;
+  options.GuaranteedTailCallOpt = 0;
+
+#if LLVM_VERSION_MAJOR >= 18
+  const auto opt_level = llvm::CodeGenOptLevel::Aggressive;
+#else
+  const auto opt_level = llvm::CodeGenOpt::Aggressive;
+#endif
   std::unique_ptr<TargetMachine> target_machine(target->createTargetMachine(
       triple.str(), CUDAContext::get_instance().get_mcpu(), cuda_mattrs(),
-      options, llvm::Reloc::PIC_, llvm::CodeModel::Small,
-      config_.opt_level > 0 ? llvm::CodeGenOptLevel::Default
-                           : llvm::CodeGenOptLevel::None));
+      options, llvm::Reloc::PIC_, llvm::CodeModel::Small, opt_level));
+
+  TI_ERROR_UNLESS(target_machine.get(), "Could not allocate target machine!");
 
-  TI_ERROR_UNLESS(target_machine, "Could not allocate target machine!");
+  module->setTargetTriple(triple.str());
   module->setDataLayout(target_machine->createDataLayout());
 
+  // NVidia's libdevice library uses a __nvvm_reflect to choose
+  // how to handle denormalized numbers. (The pass replaces calls
+  // to __nvvm_reflect with a constant via a map lookup. The inliner
+  // pass then resolves these situations to fast code, often a single
+  // instruction per decision point.)
+  //
+  // The default is (more) IEEE like handling. FTZ mode flushes them
+  // to zero. (This may only apply to single-precision.)
+  //
+  // The libdevice documentation covers other options for math accuracy
+  // such as replacing division with multiply by the reciprocal and
+  // use of fused-multiply-add, but they do not seem to be controlled
+  // by this __nvvvm_reflect mechanism and may be flags to earlier compiler
+  // passes.
   const auto kFTZDenorms = 1;
+
+  // Insert a module flag for the FTZ handling.
   module->addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz",
                         kFTZDenorms);
+
   if (kFTZDenorms) {
     for (llvm::Function &fn : *module) {
+      /* nvptx-f32ftz was deprecated.
+       *
+       * https://github.com/llvm/llvm-project/commit/a4451d88ee456304c26d552749aea6a7f5154bde#diff-6fda74ef428299644e9f49a2b0994c0d850a760b89828f655030a114060d075a
+       */
       fn.addFnAttr("denormal-fp-math-f32", "preserve-sign");
+
+      // Use unsafe fp math for sqrt.approx instead of sqrt.rn
       fn.addFnAttr("unsafe-fp-math", "true");
     }
   }
 
-  LoopAnalysisManager LAM;
-  FunctionAnalysisManager FAM;
-  CGSCCAnalysisManager CGAM;
-  ModuleAnalysisManager MAM;
-  PassBuilder PB(target_machine.get());
+  // Create the new analysis manager
+  llvm::LoopAnalysisManager LAM;
+  llvm::FunctionAnalysisManager FAM;
+  llvm::CGSCCAnalysisManager CGAM;
+  llvm::ModuleAnalysisManager MAM;
 
-  FAM.registerPass([&] { return target_machine->getTargetIRAnalysis(); });
+  // Create the new pass builder
+  llvm::PipelineTuningOptions PTO;
+  PTO.LoopInterleaving = false;
+  PTO.LoopVectorization = false;
+  PTO.SLPVectorization = true;
+  PTO.LoopUnrolling = false;
+  PTO.ForgetAllSCEVInLoopUnroll = true;
+
+  llvm::PassBuilder PB(target_machine.get(), PTO);
 
   PB.registerModuleAnalyses(MAM);
   PB.registerCGSCCAnalyses(CGAM);
@@ -217,35 +198,21 @@ std::string JITSessionCUDA::compile_module_to_ptx(
   PB.registerLoopAnalyses(LAM);
   PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
 
-  OptimizationLevel opt_level = OptimizationLevel::O3;
-  ModulePassManager MPM;
-  if (config_.opt_level > 0) {
-    MPM = PB.buildPerModuleDefaultPipeline(opt_level);
-  }
-
-  FunctionPassManager FPM;
-  FPM.addPass(createFunctionToLoopPassAdaptor(LoopStrengthReducePass()));
-  FPM.addPass(createFunctionToLoopPassAdaptor(IndVarSimplifyPass()));
-  FPM.addPass(SeparateConstOffsetFromGEPPass(false));
-  FPM.addPass(EarlyCSEPass(true));
-  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+  target_machine->registerPassBuilderCallbacks(PB);
 
-  SmallString<0> outstr;
-  raw_svector_ostream ostream(outstr);
-  ostream.SetUnbuffered();
-
-  target_machine->Options.MCOptions.AsmVerbose = true;
-
-  if (target_machine->addPassesToEmitFile(MPM, ostream, nullptr,
-                                         CodeGenFileType::AssemblyFile, true)) {
-    TI_ERROR("Failed to set up passes to emit PTX source\n");
-  }
+  llvm::ModulePassManager MPM =
+      PB.buildPerModuleDefaultPipeline(llvm::OptimizationLevel::O3);
 
   {
     TI_PROFILER("llvm_module_pass");
     MPM.run(*module, MAM);
   }
 
+  if (llvm::verifyModule(*module, &llvm::errs())) {
+    module->print(llvm::errs(), nullptr);
+    TI_ERROR("LLVM Module broken");
+  }
+
   if (this->config_.print_kernel_llvm_ir_optimized) {
     static FileSequenceWriter writer(
         "taichi_kernel_cuda_llvm_ir_optimized_{:04d}.ll",
@@ -253,7 +220,31 @@ std::string JITSessionCUDA::compile_module_to_ptx(
     writer.write(module.get());
   }
 
-  return std::string(outstr.str());
+  llvm::SmallString<8> outstr;
+  raw_svector_ostream ostream(outstr);
+  ostream.SetUnbuffered();
+
+  llvm::legacy::PassManager LPM;
+  LPM.add(createTargetTransformInfoWrapperPass(
+      target_machine->getTargetIRAnalysis()));
+
+  // Override default to generate verbose assembly.
+  target_machine->Options.MCOptions.AsmVerbose = true;
+
+#if LLVM_VERSION_MAJOR >= 18
+  const auto file_type = llvm::CodeGenFileType::AssemblyFile;
+#else
+  const auto file_type = llvm::CGFT_AssemblyFile;
+#endif
+  bool fail = target_machine->addPassesToEmitFile(LPM, ostream, nullptr,
+                                                  file_type, true);
+
+  TI_ERROR_IF(fail, "Failed to set up passes to emit PTX source\n");
+  LPM.run(*module);
+
+  std::string buffer(outstr.begin(), outstr.end());
+  buffer.push_back(0);
+  return buffer;
 }
 
 std::unique_ptr<JITSession> create_llvm_jit_session_cuda(
@@ -261,9 +252,17 @@ std::unique_ptr<JITSession> create_llvm_jit_session_cuda(
     const CompileConfig &config,
     Arch arch) {
   TI_ASSERT(arch == Arch::cuda);
+  // https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#data-layout
   auto data_layout = TaichiLLVMContext::get_data_layout(arch);
   return std::make_unique<JITSessionCUDA>(tlctx, config, data_layout);
 }
+#else
+std::unique_ptr<JITSession> create_llvm_jit_session_cuda(
+    TaichiLLVMContext *tlctx,
+    const CompileConfig &config,
+    Arch arch) {
+  TI_NOT_IMPLEMENTED
+}
 #endif
 
-}  // namespace taichi::lang
\ No newline at end of file
+}  // namespace taichi::lang
diff --git a/taichi/runtime/cuda/jit_cuda.h b/taichi/runtime/cuda/jit_cuda.h
index 653934b550883..4919cb67ad84f 100644
--- a/taichi/runtime/cuda/jit_cuda.h
+++ b/taichi/runtime/cuda/jit_cuda.h
@@ -9,7 +9,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/LLVMContext.h"
-// #include "llvm/IR/LegacyPassManager.h"            // Obsolete: Removed
+#include "llvm/IR/LegacyPassManager.h"            // Obsolete: Removed
 #include "llvm/IR/Verifier.h"
 // #include "llvm/Transforms/InstCombine/InstCombine.h"// Obsolete: Handled by NPM PassBuilder
 // #include "llvm/Transforms/Scalar.h"                 // Obsolete: Handled by NPM PassBuilder
@@ -119,4 +119,4 @@ std::unique_ptr<JITSession> create_llvm_jit_session_cuda(
     const CompileConfig &config,
     Arch arch);
 
-}  // namespace taichi::lang
\ No newline at end of file
+}  // namespace taichi::lang

From 26ae12c1eb9f631abd14d8312d968f9cd8a2abb0 Mon Sep 17 00:00:00 2001
From: Tiffany Mintz <Tiffany.Mintz@amd.com>
Date: Mon, 15 Sep 2025 16:00:35 -0500
Subject: [PATCH 25/42] updated AMD Instinct GPU jit implementation to llvm 20

---
 .github/workflows/scripts/ti_build/llvm.py | 13 +++-
 python/taichi/CHANGELOG.md                 |  1 +
 taichi/runtime/amdgpu/jit_amdgpu.cpp       | 91 +++++++++++++++++++++-
 taichi/runtime/amdgpu/jit_amdgpu.h         | 30 ++++---
 taichi/runtime/llvm/llvm_context_pass.h    | 23 ++++--
 5 files changed, 128 insertions(+), 30 deletions(-)

diff --git a/.github/workflows/scripts/ti_build/llvm.py b/.github/workflows/scripts/ti_build/llvm.py
index 16cdc4a078f75..dd18709099af3 100644
--- a/.github/workflows/scripts/ti_build/llvm.py
+++ b/.github/workflows/scripts/ti_build/llvm.py
@@ -22,16 +22,21 @@ def setup_llvm() -> None:
     u = platform.uname()
     if (u.system, u.machine) == ("Linux", "x86_64"):
         if cmake_args.get_effective("TI_WITH_AMDGPU"):
-            out = get_cache_home() / "llvm15-amdgpu-005"
-            url = "https://github.com/GaleSeLee/assets/releases/download/v0.0.5/taichi-llvm-15.0.0-linux.zip"
+            # We should use LLVM toolchains shipped with OS.
+            #out = get_cache_home() / "llvm15-amdgpu-005"
+            #url = "https://github.com/GaleSeLee/assets/releases/download/v0.0.5/taichi-llvm-15.0.0-linux.zip"
+            os.environ["LLVM_DIR"] = "/usr/lib/llvm-20/cmake"
+            os.environ["ROCM_PATH"] = "/opt/rocm"
+            os.environ["CPATH"] = "/opt/rocm/include"
         elif is_manylinux2014():
             # FIXME: prebuilt llvm15 on ubuntu didn't work on manylinux2014 image of centos. Once that's fixed, remove this hack.
             out = get_cache_home() / "llvm15-manylinux2014"
             url = "https://github.com/ailzhang/torchhub_example/releases/download/0.3/taichi-llvm-15-linux.zip"
+            download_dep(url, out, strip=1)
         else:
             out = get_cache_home() / "llvm15"
             url = "https://github.com/taichi-dev/taichi_assets/releases/download/llvm15/taichi-llvm-15-linux.zip"
-        download_dep(url, out, strip=1)
+            download_dep(url, out, strip=1)
 
     elif (u.system, u.machine) in (("Linux", "arm64"), ("Linux", "aarch64")):
         out = get_cache_home() / "llvm15-manylinux2014"
@@ -54,7 +59,7 @@ def setup_llvm() -> None:
 
     # We should use LLVM toolchains shipped with OS.
      #path_prepend('PATH', out / 'bin')
-    if (u.system, u.machine) not in (("Linux", "arm64"), ("Linux", "aarch64")):
+    if ((u.system, u.machine) not in (("Linux", "arm64"), ("Linux", "aarch64"))) and not (cmake_args.get_effective("TI_WITH_AMDGPU")):
         os.environ["LLVM_DIR"] = "/usr/lib/llvm-20/cmake"
         os.environ["CUDA_HOME"] = "/usr/local/cuda"
         os.environ["CPATH"] = "/usr/local/cuda/include"
diff --git a/python/taichi/CHANGELOG.md b/python/taichi/CHANGELOG.md
index b09dcfba95353..6a0afcadbbd8f 100644
--- a/python/taichi/CHANGELOG.md
+++ b/python/taichi/CHANGELOG.md
@@ -4,6 +4,7 @@ Highlights:
       - Drop OpenGL build (#8751) (by **Proton**)
 
 Full changelog:
+   - fix build issues with llvm 20 update (by **Tiffany Mintz**)
    - additional updates for llvm 20 (by **Tiffany Mintz**)
    - additional cuda updates for llvm20; merging from 8ca16de9a24e82baaed1deac80f71bd541c131ca to add2df35782768f5ad243cacef0922ec00890ba5 from johnnynunez/taichi master branch (by **Tiffany Mintz**)
    - removing blackwell updates; restoring window_base.cpp include (by **Tiffany Mintz**)
diff --git a/taichi/runtime/amdgpu/jit_amdgpu.cpp b/taichi/runtime/amdgpu/jit_amdgpu.cpp
index 87903f0a35404..783fbcddd2fbf 100644
--- a/taichi/runtime/amdgpu/jit_amdgpu.cpp
+++ b/taichi/runtime/amdgpu/jit_amdgpu.cpp
@@ -64,12 +64,17 @@ std::string JITSessionAMDGPU::compile_module_to_hsaco(
   options.NoZerosInBSS = 0;
   options.GuaranteedTailCallOpt = 0;
 
+#if LLVM_VERSION_MAJOR >= 18
+  const auto opt_level = llvm::CodeGenOptLevel::Aggressive;
+#else
+  const auto opt_level = llvm::CodeGenOpt::Aggressive;
+#endif
+
   std::unique_ptr<llvm::TargetMachine> machine(target->createTargetMachine(
       triple_str, AMDGPUContext::get_instance().get_mcpu(), "", options,
-      llvm::Reloc::PIC_, llvm::CodeModel::Small, llvm::CodeGenOpt::Aggressive));
+      llvm::Reloc::PIC_, llvm::CodeModel::Small, opt_level));
 
   llvm_module->setDataLayout(machine->createDataLayout());
-
   if (this->config_.print_kernel_amdgcn) {
     // Amdgcn will not generated during generating hsaco file
     // It's an interim impl
@@ -84,11 +89,47 @@ std::string JITSessionAMDGPU::compile_module_to_hsaco(
     // another way
     //    llvm-objdump -d xxxx.hsaco(can ensure that hsaco and gcn correspond to
     //    each other)
-
     auto module_clone = llvm::CloneModule(*llvm_module);
-    llvm::legacy::PassManager module_gen_gcn_pass_manager;
     llvm::SmallString<0> gcnstr;
     llvm::raw_svector_ostream llvm_stream_gcn(gcnstr);
+#if LLVM_VERSION_MAJOR >= 17
+   // Create the new analysis manager
+    llvm::LoopAnalysisManager LAM;
+    llvm::FunctionAnalysisManager FAM;
+    llvm::CGSCCAnalysisManager CGAM;
+    llvm::ModuleAnalysisManager MAM;
+
+  // Create the new pass builder
+    llvm::PipelineTuningOptions PTO;
+    PTO.LoopInterleaving = false;
+    PTO.LoopVectorization = false;
+    PTO.SLPVectorization = true;
+    PTO.LoopUnrolling = false;
+    PTO.ForgetAllSCEVInLoopUnroll = true;
+
+    std::unique_ptr<llvm::TargetMachine> machine_gen_gcn(
+        target->createTargetMachine(
+            triple_str, AMDGPUContext::get_instance().get_mcpu(), "", options,
+            llvm::Reloc::PIC_, llvm::CodeModel::Small,
+            opt_level));
+
+    llvm::PassBuilder module_gen_gcn_pass_manager(machine_gen_gcn.get(), PTO);
+
+    module_gen_gcn_pass_manager.registerModuleAnalyses(MAM);
+    module_gen_gcn_pass_manager.registerCGSCCAnalyses(CGAM);
+    module_gen_gcn_pass_manager.registerFunctionAnalyses(FAM);
+    module_gen_gcn_pass_manager.registerLoopAnalyses(LAM);
+    module_gen_gcn_pass_manager.crossRegisterProxies(LAM, FAM, CGAM, MAM);
+
+    llvm::ModulePassManager builder =
+      module_gen_gcn_pass_manager.buildPerModuleDefaultPipeline(llvm::OptimizationLevel::O3);
+
+    machine_gen_gcn->registerPassBuilderCallbacks(module_gen_gcn_pass_manager);
+
+    builder.run(*module_clone, MAM);
+#else
+
+    llvm::legacy::PassManager module_gen_gcn_pass_manager;
     std::unique_ptr<llvm::TargetMachine> machine_gen_gcn(
         target->createTargetMachine(
             triple_str, AMDGPUContext::get_instance().get_mcpu(), "", options,
@@ -105,11 +146,43 @@ std::string JITSessionAMDGPU::compile_module_to_hsaco(
                                          llvm_stream_gcn, nullptr,
                                          llvm::CGFT_AssemblyFile, true);
     module_gen_gcn_pass_manager.run(*module_clone);
+#endif
     std::string gcn(gcnstr.begin(), gcnstr.end());
     static FileSequenceWriter writer("taichi_kernel_amdgcn_{:04d}.gcn",
                                      "module AMDGCN");
     writer.write(gcn);
   }
+#if LLVM_VERSION_MAJOR >= 17
+
+   // Create the new analysis manager
+    llvm::LoopAnalysisManager LAM;
+    llvm::FunctionAnalysisManager FAM;
+    llvm::CGSCCAnalysisManager CGAM;
+    llvm::ModuleAnalysisManager MAM;
+
+  // Create the new pass builder
+    llvm::PipelineTuningOptions PTO;
+    PTO.LoopInterleaving = false;
+    PTO.LoopVectorization = false;
+    PTO.SLPVectorization = true;
+    PTO.LoopUnrolling = false;
+    PTO.ForgetAllSCEVInLoopUnroll = true;
+    
+    llvm::PassBuilder module_pass_manager(machine.get(), PTO);
+
+    module_pass_manager.registerModuleAnalyses(MAM);
+    module_pass_manager.registerCGSCCAnalyses(CGAM);
+    module_pass_manager.registerFunctionAnalyses(FAM);
+    module_pass_manager.registerLoopAnalyses(LAM);
+    module_pass_manager.crossRegisterProxies(LAM, FAM, CGAM, MAM);
+
+    llvm::ModulePassManager builder =
+      module_pass_manager.buildPerModuleDefaultPipeline(llvm::OptimizationLevel::O3);
+
+    machine->registerPassBuilderCallbacks(module_pass_manager);
+
+    builder.run(*llvm_module, MAM);
+#else
 
   llvm::legacy::FunctionPassManager function_pass_manager(llvm_module.get());
   llvm::legacy::PassManager module_pass_manager;
@@ -126,6 +199,7 @@ std::string JITSessionAMDGPU::compile_module_to_hsaco(
   machine->adjustPassManager(builder);
   builder.populateFunctionPassManager(function_pass_manager);
   builder.populateModulePassManager(module_pass_manager);
+#endif
 
   machine->Options.MCOptions.AsmVerbose = true;
 
@@ -142,6 +216,14 @@ std::string JITSessionAMDGPU::compile_module_to_hsaco(
   llvm::SmallString<0> outstr;
   llvm::raw_svector_ostream llvm_stream(outstr);
 
+#if LLVM_VERSION_MAJOR >= 17
+  llvm::legacy::PassManager legacy_pass_manager;
+  legacy_pass_manager.add(llvm::createTargetTransformInfoWrapperPass(
+      machine->getTargetIRAnalysis()));
+  machine->addPassesToEmitFile(legacy_pass_manager, llvm_stream, nullptr,
+                                llvm::CodeGenFileType::AssemblyFile, true);
+  legacy_pass_manager.run(*llvm_module);
+#else
   machine->addPassesToEmitFile(module_pass_manager, llvm_stream, nullptr,
                                llvm::CGFT_ObjectFile, true);
 
@@ -150,6 +232,7 @@ std::string JITSessionAMDGPU::compile_module_to_hsaco(
     function_pass_manager.run(*func);
   function_pass_manager.doFinalization();
   module_pass_manager.run(*llvm_module);
+#endif
 
   std::string obj_str(outstr.begin(), outstr.end());
   std::ofstream(obj_path) << obj_str;
diff --git a/taichi/runtime/amdgpu/jit_amdgpu.h b/taichi/runtime/amdgpu/jit_amdgpu.h
index 87ea552244e4e..5d798663a6b37 100644
--- a/taichi/runtime/amdgpu/jit_amdgpu.h
+++ b/taichi/runtime/amdgpu/jit_amdgpu.h
@@ -13,26 +13,24 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/LLVMContext.h"
-// #include "llvm/IR/LegacyPassManager.h"            // Obsolete: Removed
 #include "llvm/IR/Verifier.h"
-// #include "llvm/Transforms/InstCombine/InstCombine.h"// Obsolete: Handled by NPM PassBuilder
-// #include "llvm/Transforms/Scalar.h"                 // Obsolete: Handled by NPM PassBuilder
-// #include "llvm/Transforms/Scalar/GVN.h"             // Obsolete: Handled by NPM PassBuilder
-// #include "llvm/Transforms/IPO.h"                    // Obsolete: Handled by NPM PassBuilder
-// #include "llvm/Transforms/IPO/PassManagerBuilder.h" // Obsolete: Removed
+#if LLVM_VERSION_MAJOR >= 17
+#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Passes/OptimizationLevel.h"
+#include "llvm/Transforms/IPO/Inliner.h"
+#include "llvm/IR/Module.h"
+#else
+#include "llvm/IR/LegacyPassManager.h"            // Obsolete: Removed
+#include "llvm/Transforms/InstCombine/InstCombine.h"// Obsolete: Handled by NPM PassBuilder
+#include "llvm/Transforms/Scalar.h"                 // Obsolete: Handled by NPM PassBuilder
+#include "llvm/Transforms/Scalar/GVN.h"             // Obsolete: Handled by NPM PassBuilder
+#include "llvm/Transforms/IPO.h"                    // Obsolete: Handled by NPM PassBuilder
+#include "llvm/Transforms/IPO/PassManagerBuilder.h" // Obsolete: Removed
+#endif
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/MC/TargetRegistry.h"
-// Note: TargetMachine is already included above
-// #include "llvm/Target/TargetMachine.h"
 #include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
 
-// === CHANGED SECTION: HEADER INCLUDES ===
-// Add the main header for the New Pass Manager. This will be needed
-// by the implementation file (jit_amdgpu.cpp) to build the pass pipeline.
-#include "llvm/Passes/PassBuilder.h"
-// === END OF CHANGED SECTION ===
-
-
 #include "taichi/rhi/amdgpu/amdgpu_context.h"
 #include "taichi/rhi/amdgpu/amdgpu_driver.h"
 #include "taichi/jit/jit_session.h"
@@ -159,4 +157,4 @@ std::unique_ptr<JITSession> create_llvm_jit_session_amdgpu(
     Arch arch);
 
 }  // namespace lang
-}  // namespace taichi
\ No newline at end of file
+}  // namespace taichi
diff --git a/taichi/runtime/llvm/llvm_context_pass.h b/taichi/runtime/llvm/llvm_context_pass.h
index e68cb5321a6a3..536b473d21c48 100644
--- a/taichi/runtime/llvm/llvm_context_pass.h
+++ b/taichi/runtime/llvm/llvm_context_pass.h
@@ -248,13 +248,13 @@ struct AMDGPUConvertFuncParamAddressSpacePass : public ModulePass {
       std::vector<llvm::Type *> new_func_params;
       for (auto &arg : f->args()) {
         if (arg.getType()->getTypeID() == llvm::Type::PointerTyID) {
-          // This is a temporary LLVM interface to handle transition from typed
-          // pointer to opaque pointer In the future, if we only clang++ > 14,
-          // we can compeletely comply to opaque pointer and replace the
-          // following code with llvm::PointerType::get(M.getContext(),
-          // usigned(1))
+#if LLVM_VERSION_MAJOR >= 16
+          auto new_type = llvm::PointerType::get(M.getContext(), unsigned(1));
+#else
           auto new_type = llvm::PointerType::getWithSamePointeeType(
               llvm::dyn_cast<llvm::PointerType>(arg.getType()), unsigned(1));
+#endif
+
 
           new_func_params.push_back(new_type);
         } else {
@@ -273,17 +273,28 @@ struct AMDGPUConvertFuncParamAddressSpacePass : public ModulePass {
       new_func->setComdat(f->getComdat());
       f->getParent()->getFunctionList().insert(f->getIterator(), new_func);
       new_func->takeName(f);
+#if LLVM_VERSION_MAJOR >= 16
+      new_func->splice(new_func->begin(), f);
+#else
       new_func->getBasicBlockList().splice(new_func->begin(),
                                            f->getBasicBlockList());
+#endif
       for (llvm::Function::arg_iterator I = f->arg_begin(), E = f->arg_end(),
                                         I2 = new_func->arg_begin();
            I != E; ++I, ++I2) {
         if (I->getType()->getTypeID() == llvm::Type::PointerTyID) {
+#if LLVM_VERSION_MAJOR >= 16
+	  auto &front_bb = new_func->getEntryBlock();
+          llvm::Instruction *addrspacecast =
+              new AddrSpaceCastInst(I2, I->getType());
+          front_bb.getFirstInsertionPt()->insertAfter(addrspacecast);
+#else
           auto &front_bb = new_func->getBasicBlockList().front();
           llvm::Instruction *addrspacecast =
               new AddrSpaceCastInst(I2, I->getType());
           front_bb.getInstList().insertAfter(front_bb.getFirstInsertionPt(),
                                              addrspacecast);
+#endif
           I->replaceAllUsesWith(addrspacecast);
           I2->takeName(&*I);
         } else {
@@ -301,4 +312,4 @@ struct AMDGPUConvertFuncParamAddressSpacePass : public ModulePass {
 #endif
 
 }  // namespace lang
-}  // namespace taichi
\ No newline at end of file
+}  // namespace taichi

From 514446e79ed8312e9f3456efb2c13f31cdcb73b5 Mon Sep 17 00:00:00 2001
From: Tiffany Mintz <Tiffany.Mintz@amd.com>
Date: Tue, 16 Sep 2025 12:42:17 -0500
Subject: [PATCH 26/42] updating amd gpu kernel code generation to llvm 20

---
 python/taichi/CHANGELOG.md               |  1 +
 taichi/codegen/amdgpu/codegen_amdgpu.cpp | 24 ++++++++++++++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/python/taichi/CHANGELOG.md b/python/taichi/CHANGELOG.md
index 6a0afcadbbd8f..4f925437b48d5 100644
--- a/python/taichi/CHANGELOG.md
+++ b/python/taichi/CHANGELOG.md
@@ -4,6 +4,7 @@ Highlights:
       - Drop OpenGL build (#8751) (by **Proton**)
 
 Full changelog:
+   - updated AMD Instinct GPU jit implementation to llvm 20 (by **Tiffany Mintz**)
    - fix build issues with llvm 20 update (by **Tiffany Mintz**)
    - additional updates for llvm 20 (by **Tiffany Mintz**)
    - additional cuda updates for llvm20; merging from 8ca16de9a24e82baaed1deac80f71bd541c131ca to add2df35782768f5ad243cacef0922ec00890ba5 from johnnynunez/taichi master branch (by **Tiffany Mintz**)
diff --git a/taichi/codegen/amdgpu/codegen_amdgpu.cpp b/taichi/codegen/amdgpu/codegen_amdgpu.cpp
index 5056334857552..cfecb422ebd5d 100644
--- a/taichi/codegen/amdgpu/codegen_amdgpu.cpp
+++ b/taichi/codegen/amdgpu/codegen_amdgpu.cpp
@@ -110,7 +110,11 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
         builder->CreateBr(bb_merge);
         bb_oeq_then = builder->GetInsertBlock();
 
+#if LLVM_VERSION_MAJOR >= 17
+	bb_oeq_else->insertInto(func);
+#else
         func->getBasicBlockList().push_back(bb_oeq_else);
+#endif
         builder->SetInsertPoint(bb_oeq_else);
         auto fcmp_olt = builder->CreateFCmpOLT(
             input,
@@ -125,7 +129,11 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
         builder->CreateBr(bb_merge);
         bb_olt_then = builder->GetInsertBlock();
 
+#if LLVM_VERSION_MAJOR >= 17
+        bb_olt_else->insertInto(func);
+#else
         func->getBasicBlockList().push_back(bb_olt_else);
+#endif
         builder->SetInsertPoint(bb_olt_else);
         builder->CreateStore(
             llvm::ConstantFP::get(llvm::Type::getFloatTy(*llvm_context), 1),
@@ -133,7 +141,11 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
         builder->CreateBr(bb_merge);
         bb_olt_else = builder->GetInsertBlock();
 
+#if LLVM_VERSION_MAJOR >= 17
+	bb_merge->insertInto(func);
+#else
         func->getBasicBlockList().push_back(bb_merge);
+#endif
         builder->SetInsertPoint(bb_merge);
         llvm_val[stmt] =
             builder->CreateLoad(llvm::Type::getFloatTy(*llvm_context), cast);
@@ -161,7 +173,11 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
         builder->CreateBr(bb_merge);
         bb_oeq_then = builder->GetInsertBlock();
 
+#if LLVM_VERSION_MAJOR >= 17
+	bb_oeq_else->insertInto(func);
+#else
         func->getBasicBlockList().push_back(bb_oeq_else);
+#endif
         builder->SetInsertPoint(bb_oeq_else);
         auto fcmp_olt = builder->CreateFCmpOLT(
             input,
@@ -176,7 +192,11 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
         builder->CreateBr(bb_merge);
         bb_olt_then = builder->GetInsertBlock();
 
+#if LLVM_VERSION_MAJOR >= 17
+	bb_olt_else->insertInto(func);
+#else
         func->getBasicBlockList().push_back(bb_olt_else);
+#endif
         builder->SetInsertPoint(bb_olt_else);
         builder->CreateStore(
             llvm::ConstantFP::get(llvm::Type::getDoubleTy(*llvm_context), 1),
@@ -184,7 +204,11 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
         builder->CreateBr(bb_merge);
         bb_olt_else = builder->GetInsertBlock();
 
+#if LLVM_VERSION_MAJOR >= 17
+	bb_merge->insertInto(func);
+#else
         func->getBasicBlockList().push_back(bb_merge);
+#endif
         builder->SetInsertPoint(bb_merge);
         llvm_val[stmt] =
             builder->CreateLoad(llvm::Type::getDoubleTy(*llvm_context), cast);

From 2a6adb04d2c396da2e7ecb3789a57f09143331aa Mon Sep 17 00:00:00 2001
From: Tiffany Mintz <Tiffany.Mintz@amd.com>
Date: Thu, 25 Sep 2025 10:24:02 -0500
Subject: [PATCH 27/42] fix object file type; setting llvm dir based on
 environment var

---
 .github/workflows/scripts/ti_build/llvm.py | 7 ++-----
 CMakeLists.txt                             | 2 +-
 cmake/TaichiCXXFlags.cmake                 | 4 ++--
 cmake/TaichiCore.cmake                     | 2 +-
 python/taichi/CHANGELOG.md                 | 1 +
 taichi/runtime/amdgpu/jit_amdgpu.cpp       | 2 +-
 taichi/runtime/llvm/llvm_context_pass.h    | 3 ++-
 7 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/scripts/ti_build/llvm.py b/.github/workflows/scripts/ti_build/llvm.py
index dd18709099af3..43376cd3875a6 100644
--- a/.github/workflows/scripts/ti_build/llvm.py
+++ b/.github/workflows/scripts/ti_build/llvm.py
@@ -23,11 +23,8 @@ def setup_llvm() -> None:
     if (u.system, u.machine) == ("Linux", "x86_64"):
         if cmake_args.get_effective("TI_WITH_AMDGPU"):
             # We should use LLVM toolchains shipped with OS.
-            #out = get_cache_home() / "llvm15-amdgpu-005"
-            #url = "https://github.com/GaleSeLee/assets/releases/download/v0.0.5/taichi-llvm-15.0.0-linux.zip"
-            os.environ["LLVM_DIR"] = "/usr/lib/llvm-20/cmake"
-            os.environ["ROCM_PATH"] = "/opt/rocm"
-            os.environ["CPATH"] = "/opt/rocm/include"
+            os.environ["LLVM_DIR"] = os.environ["LLVM_PATH"]+"/lib/cmake"
+            os.environ["CPATH"] = os.environ["ROCM_PATH"]+"/include"
         elif is_manylinux2014():
             # FIXME: prebuilt llvm15 on ubuntu didn't work on manylinux2014 image of centos. Once that's fixed, remove this hack.
             out = get_cache_home() / "llvm15-manylinux2014"
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e97e6da2e8fa8..96d6c92301471 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -156,7 +156,7 @@ if (TI_WITH_CUDA)
 endif()
 
 if (TI_WITH_AMDGPU)
-    set(AMDGPU_ARCH "amdgpu")
+	set(AMDGPU_ARCH "amdgpu")
 endif()
 
 if (TI_WITH_DX12)
diff --git a/cmake/TaichiCXXFlags.cmake b/cmake/TaichiCXXFlags.cmake
index 58b9a5a3ded3d..8d6b56718565d 100644
--- a/cmake/TaichiCXXFlags.cmake
+++ b/cmake/TaichiCXXFlags.cmake
@@ -56,8 +56,8 @@ if (WIN32)
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267 /wd4624 /nologo /D \"_CRT_SECURE_NO_WARNINGS\" /D \"_ENABLE_EXTENDED_ALIGNED_STORAGE\"")
     else()
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -fsized-deallocation -target x86_64-pc-windows-msvc")
-        set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -gcodeview")
-        set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -gcodeview")
+        set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -gcodeview -g")
+        set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -gcodeview -g")
     endif()
 else()
     if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake
index b716b071d48fc..0c22d7331dcdc 100644
--- a/cmake/TaichiCore.cmake
+++ b/cmake/TaichiCore.cmake
@@ -106,7 +106,7 @@ if (TI_WITH_CUDA)
 endif()
 
 if (TI_WITH_AMDGPU)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_WITH_AMDGPU")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -DTI_WITH_AMDGPU")
   file(GLOB TAICHI_AMDGPU_RUNTIME_SOURCE "taichi/runtime/amdgpu/runtime.cpp")
   list(APPEND TAIHI_CORE_SOURCE ${TAICHI_AMDGPU_RUNTIME_SOURCE})
 endif()
diff --git a/python/taichi/CHANGELOG.md b/python/taichi/CHANGELOG.md
index 4f925437b48d5..7b99f10104069 100644
--- a/python/taichi/CHANGELOG.md
+++ b/python/taichi/CHANGELOG.md
@@ -4,6 +4,7 @@ Highlights:
       - Drop OpenGL build (#8751) (by **Proton**)
 
 Full changelog:
+   - updating amd gpu kernel code generation to llvm 20 (by **Tiffany Mintz**)
    - updated AMD Instinct GPU jit implementation to llvm 20 (by **Tiffany Mintz**)
    - fix build issues with llvm 20 update (by **Tiffany Mintz**)
    - additional updates for llvm 20 (by **Tiffany Mintz**)
diff --git a/taichi/runtime/amdgpu/jit_amdgpu.cpp b/taichi/runtime/amdgpu/jit_amdgpu.cpp
index 783fbcddd2fbf..b7a8c349886e2 100644
--- a/taichi/runtime/amdgpu/jit_amdgpu.cpp
+++ b/taichi/runtime/amdgpu/jit_amdgpu.cpp
@@ -221,7 +221,7 @@ std::string JITSessionAMDGPU::compile_module_to_hsaco(
   legacy_pass_manager.add(llvm::createTargetTransformInfoWrapperPass(
       machine->getTargetIRAnalysis()));
   machine->addPassesToEmitFile(legacy_pass_manager, llvm_stream, nullptr,
-                                llvm::CodeGenFileType::AssemblyFile, true);
+                                llvm::CodeGenFileType::ObjectFile, true);
   legacy_pass_manager.run(*llvm_module);
 #else
   machine->addPassesToEmitFile(module_pass_manager, llvm_stream, nullptr,
diff --git a/taichi/runtime/llvm/llvm_context_pass.h b/taichi/runtime/llvm/llvm_context_pass.h
index 536b473d21c48..0fcc06ab3f673 100644
--- a/taichi/runtime/llvm/llvm_context_pass.h
+++ b/taichi/runtime/llvm/llvm_context_pass.h
@@ -287,7 +287,8 @@ struct AMDGPUConvertFuncParamAddressSpacePass : public ModulePass {
 	  auto &front_bb = new_func->getEntryBlock();
           llvm::Instruction *addrspacecast =
               new AddrSpaceCastInst(I2, I->getType());
-          front_bb.getFirstInsertionPt()->insertAfter(addrspacecast);
+	  addrspacecast->insertAfter(front_bb.getFirstInsertionPt());
+          //front_bb.getFirstInsertionPt()->insertAfter(addrspacecast);
 #else
           auto &front_bb = new_func->getBasicBlockList().front();
           llvm::Instruction *addrspacecast =

From 55163602821e1416f37c70406f5f1f2fec543707 Mon Sep 17 00:00:00 2001
From: Tiffany Mintz <Tiffany.Mintz@amd.com>
Date: Thu, 25 Sep 2025 10:26:47 -0500
Subject: [PATCH 28/42] adding bitcode for gfx940,gfx941,gfx942,gfx950

---
 .../amdgpu_libdevice/oclc_isa_version_940.bc     | Bin 0 -> 2276 bytes
 .../amdgpu_libdevice/oclc_isa_version_941.bc     | Bin 0 -> 2276 bytes
 .../amdgpu_libdevice/oclc_isa_version_942.bc     | Bin 0 -> 2276 bytes
 .../amdgpu_libdevice/oclc_isa_version_950.bc     | Bin 0 -> 2276 bytes
 4 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 external/amdgpu_libdevice/oclc_isa_version_940.bc
 create mode 100644 external/amdgpu_libdevice/oclc_isa_version_941.bc
 create mode 100644 external/amdgpu_libdevice/oclc_isa_version_942.bc
 create mode 100644 external/amdgpu_libdevice/oclc_isa_version_950.bc

diff --git a/external/amdgpu_libdevice/oclc_isa_version_940.bc b/external/amdgpu_libdevice/oclc_isa_version_940.bc
new file mode 100644
index 0000000000000000000000000000000000000000..dd3c79606ca6d91edea2abe6a3ee23f8366a78cd
GIT binary patch
literal 2276
zcma)7eN0=|6~D&d>wxo~>D(5NXZM+ph$%~X7!vH*)@(nznU$z6Z6zvg!q3JIG<-aN
z#DFtRHfDXKvnnU$4@=D?yNR~7`-iM(${$%3^H}Dl7NgZPA&nw?Kt48_u$7@otF)bG
zLf8GbYn^lNdFS49f9K;KJK0+sCp8GwAcVXsrsd45AN>5xzZPF;EVT5<j5WxhLg?=@
zgzBmcs0#LIfpDVvlA}=Ft56%wmu0i6)*7W&U;R^WT=`s;^KzZDM{U%Ylnu|<8k^=^
z{Ia3B=LsRDc3iF+Dk!gOX!g5v^oKHI)9e6|t~d0gb2IN$X+`|eliH?Ks#UE!9VsAm
zkw&N%?rI8DtAFjBxv1_3mJTktUA0G!Id9iEn~o`8|FFvYCQ>ORT;F(yAI2e*5afkZ
zN+C)ga2&}9{F+_Ay{g#F;;0?RJU9Ytb!AccGKR=d7KdpZV{iymOl48|0;<T@Pr3g`
z75(}t9>>%;ig1j^=RJq~z+hZbg$_B`oX+B48pEX_HwGgI8~}`^iLVgugEiRZ!DAjW
zWPglp9}FG<^N<lRSn*{)UD-+FsZ|A>nx-G$E*<>JKB(L{2&xJPRS;pxtQzyQ6p(!y
z^L9K1U=Rk&V<iIc9N2%q;ziDl$N1Xds^Vdqh=H?l982Txlgj$b_SaLIdRdn<bLH6R
zZ+`o!_ZOf3O{bx2-eQj0krEP<s7CQ+MPg|mis)4uVfbX!XH?qyzI5uyvG3ikX5TBd
z%HQx*XRB*-G`ik^&<gD7RK}_0G8NFsY;&w>R!lkLczOzlGeOl|M1H0w_TogR%Bf$<
z>sE49fT^i$OEwnewgPKQ$!*1{RD{PDlz7@D@PL??N2dgy7PQ5BVkb?M8FHK<H+P9$
zH_@qH)@>N5HG^)6(+8M~mYYe-Dr;VpTW=<<vvNyuR}=N%3&Xg3SO@`+3w&DOJviEi
zqhHj3*~z#ts1$+?LMS6lXA<9jxJx`N5&yhJ>@ad~AxCZHsA7e$bJQIJwVcr1;S_&L
zv*rS8&Q$mf(KOGR=H!-nu^le`a>nvUxn)6A{eDE?2ZRuxj11!`j}U1PB3VJZ@)G$F
zk^gWLTWMm`PL^q62fU!$bnAKjl0*NAfm%=K*Gtq^S@&0&KG5)@X@Na`C21+TTOK>&
z5hkbbROZFLy)-FBK>wQl4oBTL=r;1YAItO-V1sU#Q@s3>q^T%c7FZhu<jW=N7HgRU
zQB3`=7CaRu0gn>L+Hfe7_`#-{*ttb)yNS=4=KLoP{Z{$uwX$vl0JyZ4g`{mU**Ghj
zQmnZsT5sgd1rdC-7F~-$B?eSM2;hLA{qt>N$3uQrQtZ;?HbZXH<i492R}-5I0pjN+
z#e)*D<tDyhn*XxHQA-KkTKVZ6gYK?Dw`|Zs2Jai53D7^Y++=Mx<mUOLxj1kWvJcM2
z0H;fs<OS_V_lQRfSuPO|?Bs(i+#<^4;a_6#We~arFGV=A3aL!YI?N>zLT7_a746Ge
z-JP7S<j`;AsX&X5A<9yINied_u!iLM1{`(?gIVE6aX5Q@US>^+)_Kvi%ASUDDni6m
zF2j>fP7hF5^1A+nCeU}5As*4>qmts0o7|I35!)rjU79Sfg0olc$>=biRtnnwZ{83s
zP>0u8QxRe)H>Z=<qTC9>oEKF+M+**WC4UsktO!qRnntJ>-cLyHqS-l>_DIO+qMH+`
zOP4<G$*S9itQ|(@n`y&UllnWdwp<Hxoav!`qulseI_eLFM<&K;Bl9f$&W=Rrw&6&G
z51(#t2VwZU_gUY>SbIM=<ewOSf%lKI6Jz}Oh`)VwG&**I4^6!0_eJQ?gzrSRbe=Yz
z?Cj{G16_=%J8;U|Wi$mC|Ea*q?vs8m=QD98uG`o(#Cbc6okPqi|F<T6!(3=EJQDM}
zds({AAN7x-W@UnzQLCDjvW^+G2BeOpUDH{eS0xQpr>bBtxI8ocvE=b^qc1!Sbp7~S
zZ$Ez_+}LXG=X~#<3->l34_|F;bs4>qG;*Q&SH9!n4qu(+?Z)TaPa*U!^qLb8tGe1y
z8@yp08{jigH&V*YtK!UE>UQqveTURyAZG_}<EgGq4{^^8PmbJ@9v*qmP3$ei6L`9<
z(h;TvsAlOs4DeS()1u6BEoqw<tIk1lq1MaPI!$eH62G6&FLBgrjw&1!F_)K-dysgj
zgE&z)cTh|<e=tGmtU-eip_DD~a-gpj<Q9O8K2{^7fEgF9*OHbi&^Y9_Y|;u%BrQT2
zpdgJZ|JO!B-Bb;-H2m;i{tV1d$me$@SuU-Cj6i)r(zyX3X-Kmq^O69g2dEzC6`&(P
z(m81y>ZP@`|8N}YrTO15>Lsu+eh3&EVFnBzNb=y|gm2V0*!z-W@NohDxBmX$;^D@I
Y&ickrz$DRdn5&fJ=*ajv=>~-U1A}?*hX4Qo

literal 0
HcmV?d00001

diff --git a/external/amdgpu_libdevice/oclc_isa_version_941.bc b/external/amdgpu_libdevice/oclc_isa_version_941.bc
new file mode 100644
index 0000000000000000000000000000000000000000..f9bc8f503dbfac27e76605bb576f0936b852a105
GIT binary patch
literal 2276
zcma)7eN0=|6~D&d>wxo~>D(5NXZM+ph$%~X7!vH*)@(nznU$z6Z6zvg!q3JIG<-aN
z#DFtRHfDXKvnnU$4@=D?yNR~7`-iM(${$%3^H}Dl7NgZPA&nw?Kt48_u$7@otF)bG
zLf8GbYn^lNdFS49f9K;KJK0+sCp8GwAcVXsrsd45AN>5xzZPF;EVT5<j5WxhLg?=@
zgzBmcs0#LIfpDVvlA}=Ft56%wmu0i6)*7W&U;R^WT=`s;^KzZDM{U%Ylnu|<8k^=^
z{Ia3B=LsRDc3iF+Dk!gOX!g5v^oKHI)9e6|t~d0gb2IN$X+`|eliH?Ks#UE!9VsAm
zkw&N%?rI8DtAFjBxv1_3mJTktUA0G!Id9iEn~o`8|FFvYCQ>ORT;F(yAI2e*5afkZ
zN+C)ga2&}9{F+_Ay{g#F;;0?RJU9Ytb!AccGKR=d7KdpZV{iymOl48|0;<T@Pr3g`
z75(}t9>>%;ig1j^=RJq~z+hZbg$_B`oX+B48pEX_HwGgI8~}`^iLVgugEiRZ!DAjW
zWPglp9}FG<^N<lRSn*{)UD-+FsZ|A>nx-G$E*<>JKB(L{2&xJPRS;pxtQzyQ6p(!y
z^L9K1U=Rk&V<iIc9N2%q;ziDl$N1Xds^Vdqh=H?l982Txlgj$b_SaLIdRdn<bLH6R
zZ+`o!_ZOf3O{bx2-eQj0krEP<s7CQ+MPg|mis)4uVfbX!XH?qyzI5uyvG3ikX5ZUt
zmA~Ps&Q{mvXmq^+p%vKEsf<(0Wh$VN+2&Z&teA4f@$?i9XM(D`i2O`V?8S*rl~ccz
z*RABJ08>-hmTWA_Z3WhrlG}<?sR)lRDDkvQ-~ll&k4_0ZEoh7N#7>$hGvqi!ZtfDh
zZlY7YtlKb9YX;pCrw=d}EjN>vRo1*Hx86)zXXTdSt|sci7lv{7un+<s7x=WmdvLT3
zN57~6vy*XQP$>i(giuD9&LqD5aF=*kBK~=c*kR<}LXO(XQN;>h=cqddYB{01!zuoh
zX3YiGoT=~|qG_Hr&B-nEVmn;=<&5Qza?66K`u&K&4+tSX85zb?9wE{oM6!Z*<t6eV
zBLCqgw$j9=oh;MD4tPPg>DKf5C5QeK1GS#eua~H;vhJ@keW2k*(*k??O43qvw>);l
zBTP==smzOgdudXNfc`c89gezh&~4;(KbGkwzy{qer+E1%NmEg@EU-2R$d^mjE!Hvz
zqL}(!EqE$S0v;ufwc$`E@q<k@v2%;qb`zg7&G}Cp`mOTQYh~RA0B~t73rX8zvT;^4
zrC4)OwBE>@3nKVvExHziN(`ui5WoRJ`{&!lj)(lLq}Zj&ZHC;Y$$d95t|m4a0>sZt
ziU%cP%T0X2H2-CXqm~l7wer(D2HjnQZrPxN4Bj_96QF-)xyjmY$j$Rfb8+A%WFMT3
z0Zx}N$qU+#?h%g|vRon_*vSW3xJ8u7!@tDf%OG?MUW#yJ6;hd)b(l*cgw6(;D%zK`
zx;r^t$)VrKQ-KyALzJcbl3-+;VGYUi4LIx)2D8GC;&Ar*yv&*st@EO3l|2pRRD_7B
zT!tr|oF1UA<aPZCO`z{ALp-9%M<vB0H@PR7BDPD4yEIu|1!u3?lhI*3trWEV-@GAO
zpboFGrXs{pZcZnyMY$D%IWMYujusr$O8zL6SrMMtG>uR%yq}QXMYD4%?U9htMK>o>
zmo9zWlU273Sv!o*H`9izCiQn@ZMhcYIMYM>M!E5`bkrXTk4%iyM&?=gogInLZNrfW
zA3ojQ4#Mzx@3X#%vG#s$$Uiau0`DJZC&u{m5r6yWXmsoZADVd0?~Bl(3Ezos={#*b
z+1b%W2f7$jci@z_%V-KP{!@XI-6#EC&S&CGT(_}ni1T(BJBOH4{%=kChPlvScqHa`
z_p)@KKk6Sv&B_EbqgFL5WgRnW4M-hHyQZ@`uSy!IPF2BPaCv6>W69&;MqhXu==$-u
z-hTc<xUtpV&-vaz7w&C79=_Vx>N0vIY2-rluYAYD9lkot+l|k;pF-$e=rt!GR&}+Z
zHh9B0Ho#|~Zlsi(SH+pT)a~5S`wpqaK+X={##3FJ9^#%Go*cO)Jv{QBo7h{3C-8Jz
zr6WuUP|ebN7~rpnrbU_MTGBQzR-J?9Lampnb(-4ZBz`}kU*f3M991|dVlFQu_aO05
z2XUfq?x2`z{$PUAS%U^4LMdC|<v?F6$SnXFeXK@E0W&UIuO%&4pmE4;*`yVkNLqw4
zKtUQ+{;!RMx~UpuY53v4{27>^kk9W*vRqmN8G-tMq;ms6(vW6J<|P3}4^TbOD?mqp
zq;t|Z)Jtn=|KT{)OY^^B)JtGt{17lS!VDNbkmSL^3E!x1u=gd$;Nt@PZ~gth#lwva
Yo%M~MfJvg^Fjpzb(UI|U(hUgx2b}ruivR!s

literal 0
HcmV?d00001

diff --git a/external/amdgpu_libdevice/oclc_isa_version_942.bc b/external/amdgpu_libdevice/oclc_isa_version_942.bc
new file mode 100644
index 0000000000000000000000000000000000000000..8c9446e4ed30e94cacf67fffb71dcef352771a87
GIT binary patch
literal 2276
zcma)7eN0=|6~D&d>wxo~>D(5NXZM+ph$%~X7!vH*)@(nznU$z6-6Se)!q3JIG<-aN
z#DFtRHfDXKvnnU$4@=D?yNUMs4_VQaKe8(3vCK^^MyqK;8b$ViglsfnD?^o5X*<t^
zuKRD-I_KW=&b{aU&c{7=vUfI4YY?hI2zgaZ%ehzI`{}uVExy!PXz7s|Ymh;O(BEYU
z)m0f#73|Rh;Y9NlN1?h`p*EVY%4SuqHA=0%`p4e5@`Wnr)jDU7+Nd!p8(yq6HqE*C
zWkYk%Q$kAZxLP$-P~Ozg>^JA=_hrVW*#RP5Z|F(qX5OjNiui-4wN0s1t6Fz9Qb6c3
zjZiJz)fA{!|JpfoS=|pT9b9s|YLA_8-mP&qolw61ewFu4q*6$@zVZ$~j6)_N$O~tb
zLX<w_IFb?gHM@R$Rk4@FQ9F)#a0J%s%A)dR43VKM4%0Zs;1H;o%A)cGRFN;Aa{rGi
z`sGtRj;V1J;TVlCdXD&^!MLOf9dWQZoyEa4hD$?k3`Pz)02oUXUm!dHYp~6O$2?@n
z{sh|r7(4{#AtPY0;>$t0vXjPBs|q+ZO+UF^I{1ZsP`PmsR22@YAi|PaHRfq4AO|$&
z?RW~nAPknrN(A6JwEu3!i<}#e@wLHK#iKM4183tnmd4*EmGxKcuctKivMy)l+KJKM
z{Ptt-&p!T}PD9tc#T>UIB_t+MjpECS#L@v2(W^AV@X4spsI>Kc>C~|k-@04P{&J^P
z{)VSITV0!@(ai>gR$x!3GEOa*sendin`2G0V#*oE(^EK{399ZR@)I?&A169hPW@6|
zx00g*Oig85vau+)6<AwJZYxfuB0RpN#M3T;2gJNQIwkP5pe@!DyJ@1#kmC%wxkv1|
ziB9#hZo@#W8FWjWKEPbI+)i3nS@WXYdOK;Im0OB?ny3d~8phqjLI`+V;L`%{!O=Dx
z{j3JePR4~nr4VcoLK$H?llc0hJ>pS`_~#vBmyvr5Ich6M6)Sw5qwX1~<%I4Yr}$Hv
zH5XWOrowNDrg_#hC%4Rt?QrSmGnPNfEeoRR_ag#7AcXj2WEf9*gh+!B$qL$)SI9?*
z{D+&^N)wxQvP=`Z;04{LThHs49Quz8)Otd{UZS?jy1&Zwfrgh&3+&lzNlVe)^28C3
zFgb;%GB5Y-r%5RS`q%XLIBLhB+sNyFDAP-T4Z1x}@tq$fO-0eNz}g@npD$UrSj!xU
zV(NFb;HfYPc$7HShC`Xe_cqnU?j2&=O?=8U=Rb1jx6041m312cz@@b;ByEey##zyn
zV$DU-dMj@(h~T5O=voXaF`x=U00#u^pYIa89`citVvi=b8FHH@58TAKn%HCr5I-#`
z9+rqLH}M(M{FhyhT1x2F%Fpf^boULqWrGefxMO%OK>x&Yo3-7No9C0};=palJ~$f#
zoGxLK7qlNdARaShxkNm)lMl0Sizt(ae~!VILFg8|6yeA!q%tw<FqcFKoeeTov@d6M
z_j0<DL%)%y0xdp<C`<Vz!N@kl8j=?qaM&dbW`!Tb;q1+MnKdO^=S9;hdlt&62oY1c
z3{N{bJwRQ_>-rO#K;L<WcubRzONz&Ca$hn<Y?l=GX|lWu&R(}Cqr-SwDQNq@dP}rG
z9o}F~MTnu?oK9Mcaw`OLUR3oQFF2@`{BbC=B0RBa8lhfzKOwz~X6IDeV<Dr9Zce1G
zT=}pkt8N>zb{L&+rVZCk>Tk%}axKVlu7~!Ga^vUes6P}QnHZ;y%=7R&KN6wah9eO^
ze73zEgyD<c=Y11n?fu-4e`5S4-apPxjPVyE{`S$)=-4SfH1V3>7okHFzEj=OdD?io
zv!ja+bTOvxz!`6s(G+0(X9B0YPy4-`&%~LyZe!OF=j||d4l!r^Uz_v|bD_cTNX+l<
zW$8YD)IW-vl?i4>t!h@vI%d=wkUElfO=oppl{8SDs)D`X^33$dk|)EBzVI~A&697v
z{o<u?W2?QN^Syf^+}nIIe7&*NW%NqY$ff3A`A&vAe07$$8((lggU~Oa*PMb_)zyaD
z;0@#00H1-nky3766=&{KcXP*g98!ycoE^T6XSy~$!~-`xIdV&Sc;o{&vA+;c;OVwX
zN0<_znx*$Jz+V$hi!#fNq-|cTx&Y0ES}#-UG_}P^d?%q_;;7XeRX8kSE-xeZA@NWL
zaiVVSu$XH8V1m+Fg9af&DO=#>Kwm4!EdUvPs76QuGcH<hBrVsVama1iq!pS-T7)t{
zK^j&5uZ@JdsTyQy_~F0&Ihdc4&+kdHTv`Jef%<@?a|1xqkY-8dB>_ecP(9FhfsO%5
z=cI9@m)6q$qj98{=6}Pem%zgKK455s88Eyj$%BIvzER&`?<<bMCk6Q5`ul&2hZ`F@
X?;Ae_lSIQ|u2PbtBjXpO8xZ;rwt4T5

literal 0
HcmV?d00001

diff --git a/external/amdgpu_libdevice/oclc_isa_version_950.bc b/external/amdgpu_libdevice/oclc_isa_version_950.bc
new file mode 100644
index 0000000000000000000000000000000000000000..499d64a65d991de7e0d714726ce250b34009ca8e
GIT binary patch
literal 2276
zcma)7eN0=|6~D&d>wxo~>D(5NXZIPCh$>5Y7!vH*)@(nznU$z6-6Se)!q3JIG<-aN
z#DFtRHfDWfvnnU$4@=D?yNUMs4_VQaKe8(3vCK^^MyqK;8b$Vid~7sfD?^o5X*<t^
zuKRD-I_KW=&b{aU&c{7=v9~u)X%MPI2>Db@>)F@d`{~(#Exz1bZ0(Vm>X1Q&(BEYU
zHPjeT4eZeZ;iU1hvsl}!P@61QWV5PcbxN(i_Q$@2^2Hk0l?GRj+N3cnn_j9nwamHs
zWrMNjDIu+PUa1)>DsO0L_8W8b`!Z9@>;REzH1uThGw;-BMf}0j+Lm<sm|AxxT14m)
zjZi(@)e@{#|JpTkN!<@D6I$}P>yI3F-Klf699O>aevR)<q*6$@zVZ$~jKgLj#0#gD
zLX1A(IGPpsHHUtCRk54HF$a!&aTM0->Z1B(43Xg+j?g&H;4r9|>Z1AuRM9V=^8Ak~
z_T^JNj;nDD;W&*icn|r3!Gxp=A9Aocox`CFhD*a93`Pz(02s><Um)BEYp~6W$Gl|N
z@dVpG7(4*xC8J=l>dStnx|7CJs|q+ZO+UF^I{1ZsP<e0&R8<bD5W<pKHRfq4Ap11t
z9e4`BAPknr$^_s!u>Wq=i@XPq@%5oq#iI-n2WJyFp26QDm5o;%Z=^MivMyKl>haOv
z{Pttt&p!T}PD9td#T<1YB_t+UixSI<<kCJA(Q7oq@X4gls<e%Lne>t4-@H@H{*r%Q
z{v&T~uC_i;qZ>^Kt-zj6WrA9+P(h8%KF6A8#k4Ddr>Afv8&cgv<R@xkFF|ywT>7Pg
zZY56znY!w>RC7sgFS7Qu++LbWM|pfviKpEH4~ThrY)as1L0f7hb}~eTA;%eVbC=ll
z5S{8}-G+f$Gw7B$eUQ0iy_K@AvX(`;?N-V*E4P+*H8C%~IE=f8g)s1hz-I*Bi(_p#
z_E{a6olFRWN+HxFgtNkQHu<$jyTqe1@z2}D4kPy!^VC+JDpmP9N8L40%Sqi`PVuJ<
zYbmmpY?a>>&GW2zPHvqS+u_pBXRLpeTNgyt??(iFKnU}x=rEr03ehGZniI4uuab`t
z`411Vl_54AWQ8Vnzze!fw_eaMIrSeIsP&|Ny-aOYbbpoUgH5lP7uYjbQ`VBF^@$^1
zVR8yjWnbyr%aBq8^snpha?}HZZlj?4flMy}Ht2Ra#kYT$GM7Z_0&9nWe7<DcVy$x^
zis|3ggQvnI;8Egu8xCia-`!LbJGY5#5Ai8uEPUkDZ&jXMtLQcWfLm)_NZA)t&9kC8
z&00#L?PkGJ6v0PZ$-Nj-Vn7vw01gP+Ki?sCyyPck#V$>5Gvqc+?t6%FHL=MMAbwg_
zJS-Di9^x~`_?I1yT1x8HD$nj1boUIpWrGef_`vX7kp7AF7HhvLx6G$3rGZ<JeQ-7o
zINib|FK9owPdsMGN||`*ARp%77EvaT{2YTXgU~H_DZ<fJNM&-?X(@{kIy+>l<XFz>
z?&fu6r+%YA1zY_LQIYaXf{|^8HKZ;y;fPxp%n9F5z}Xw~GFw`-&5Pz$_6(F$2_mL)
z8=iJ?dVso8(Df%Z!M<}0@t7tbmlcma<ep@T*e)yX(PU*6oW15q#fI^;QqcB)^`>Zr
zI=s%BOAtf3C6lt1<TeQAyr}9qT69t?g`-erC3s@fG(x@be!_Y;&CaQ`N5UpIZA_*w
zU;eNsr*0dvb(malW(?QN>aWY%@~y~uwuknQa^vUdSRfo3nHZ-{%nR^4Hxi}WhNDqF
za;Ci<gy9Rm7yJ`r?fu+PU}F4bJ}}NsjPVzuf%ehS*w{%vJn?$KAEm<+{*&F(dD?WU
zv!ja+b}{De;Avl%$sA+?r-P@uPX&CO-^`i0Zd2C~=j$+a4l$<#&rkY?x$t0QBp&ee
zvUFb{78pfFWs;dutBgun$BbG7Qb)?6>8vfNQU<D1Rdf{H-kJV*>O`d3ADITaapJAF
zU%D7+KIZ7>{O_KR^cqh@t~DQXn|zWqa?$uJ|A|P4zrp%;^NXHm5c&o5nv)Q#hWc<D
zykQ(4;ImLSQpzo>;><njPX6cvr_^E~X9sWNnXXMQao+<^j@*(S9(mtG>@6gcc)G3H
z5vByFX6Zc)@K;6iqRe_dWuF&o&O>vd)+^LHO>J=!e~{EKanx#_DjpOuSCEl=ka(zr
z1ko^eP)v1yFhl9AL4y#Xl&$b`psy9>R)CB?R3oH-8JBF=Q`W1{IOO(R$_7m&BSIOV
z5RIz;*Cs;UQVX&)0`On{9L!J2=XWJpF0Fw~Kz%^cxd9+)NV6pKk^rLzs1fKpKu3V2
zbJ94}OKWNW;W*Sw^S@y<N?>7p4=^;t3>e;%<iWuS|EPbk_f_ZMlLGv2{r$hi!;KA{
X^N*i|NuuEhS1rlWk@54=4G8@Q(#Y={

literal 0
HcmV?d00001


From 48cc4f7170b33b7e1ae96ed8fd2c2a2079bd9ecf Mon Sep 17 00:00:00 2001
From: Tiffany Mintz <Tiffany.Mintz@amd.com>
Date: Fri, 26 Sep 2025 13:09:14 -0400
Subject: [PATCH 29/42] adding patch for changes to external spdlog

---
 spdlog_fmt.patch | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 spdlog_fmt.patch

diff --git a/spdlog_fmt.patch b/spdlog_fmt.patch
new file mode 100644
index 0000000000000..c451134aa2e38
--- /dev/null
+++ b/spdlog_fmt.patch
@@ -0,0 +1,41 @@
+diff --git a/include/spdlog/fmt/bundled/format.h b/include/spdlog/fmt/bundled/format.h
+index 01f41f5..8785c75 100644
+--- a/include/spdlog/fmt/bundled/format.h
++++ b/include/spdlog/fmt/bundled/format.h
+@@ -555,7 +555,7 @@ class u8string_view : public basic_string_view<char8_t> {
+ 
+ #if FMT_USE_USER_DEFINED_LITERALS
+ inline namespace literals {
+-inline u8string_view operator"" _u(const char* s, std::size_t n) {
++inline u8string_view operator""_u(const char* s, std::size_t n) {
+   return {s, n};
+ }
+ }  // namespace literals
+@@ -3467,11 +3467,11 @@ FMT_CONSTEXPR internal::udl_formatter<Char, CHARS...> operator""_format() {
+     std::string message = "The answer is {}"_format(42);
+   \endrst
+  */
+-FMT_CONSTEXPR internal::udl_formatter<char> operator"" _format(const char* s,
++FMT_CONSTEXPR internal::udl_formatter<char> operator""_format(const char* s,
+                                                                std::size_t n) {
+   return {{s, n}};
+ }
+-FMT_CONSTEXPR internal::udl_formatter<wchar_t> operator"" _format(
++FMT_CONSTEXPR internal::udl_formatter<wchar_t> operator""_format(
+     const wchar_t* s, std::size_t n) {
+   return {{s, n}};
+ }
+@@ -3487,11 +3487,11 @@ FMT_CONSTEXPR internal::udl_formatter<wchar_t> operator"" _format(
+     fmt::print("Elapsed time: {s:.2f} seconds", "s"_a=1.23);
+   \endrst
+  */
+-FMT_CONSTEXPR internal::udl_arg<char> operator"" _a(const char* s,
++FMT_CONSTEXPR internal::udl_arg<char> operator""_a(const char* s,
+                                                     std::size_t n) {
+   return {{s, n}};
+ }
+-FMT_CONSTEXPR internal::udl_arg<wchar_t> operator"" _a(const wchar_t* s,
++FMT_CONSTEXPR internal::udl_arg<wchar_t> operator""_a(const wchar_t* s,
+                                                        std::size_t n) {
+   return {{s, n}};
+ }

From ed925e62f7c9357b12231597adf8eacd35313d88 Mon Sep 17 00:00:00 2001
From: Tiffany Mintz <Tiffany.Mintz@amd.com>
Date: Thu, 30 Oct 2025 09:12:03 -0500
Subject: [PATCH 30/42] updating dockerfile for llvm 20

---
 Dockerfile.rocm | 38 +++++++++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 13 deletions(-)

diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index 5678a815a3e0b..d894a27cb481c 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -1,11 +1,12 @@
 # ---------------------------------------------
 # Stage 1: Build Taichi and generate artifacts
 # ---------------------------------------------
-ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.2
+ARG BASE_IMAGE=rocm/dev-ubuntu-24.04:7.0-complete
 FROM ${BASE_IMAGE} as taichi_build
 
-ARG LLVM_VERSION=15
-ARG GPU_TARGETS=gfx90a,gfx942
+ARG LLVM_VERSION=20
+ARG GPU_TARGETS=gfx90a,gfx940,gfx941,gfx942,gfx950
+ARG ROCM_VERSION=7.0.0
 ARG PROJECT_NAME
 ARG RELEASE_VERSION
 ARG TI_VERSION_MAJOR
@@ -24,26 +25,35 @@ ENV SRC_DIR=/src
 ENV TAICHI_SRC=${SRC_DIR}/taichi
 ENV LLVM_DIR=/usr/lib/llvm-${LLVM_VERSION}
 ENV PATH=${LLVM_DIR}/bin:$PATH
-ENV TAICHI_CMAKE_ARGS="-DTI_WITH_VULKAN=OFF -DTI_WITH_OPENGL=OFF -DTI_BUILD_TESTS=ON -DTI_BUILD_EXAMPLES=OFF -DCMAKE_PREFIX_PATH=${LLVM_DIR}/lib/cmake -DCMAKE_CXX_COMPILER=${LLVM_DIR}/bin/clang++ -DTI_WITH_AMDGPU=ON -DTI_WITH_CUDA=OFF -DTI_AMDGPU_ARCHS=${GPU_TARGETS}"
+ENV ROCM_PATH=/opt/rocm-${ROCM_VERSION}
+ENV TAICHI_CMAKE_ARGS="-DTI_WITH_VULKAN=OFF -DTI_WITH_OPENGL=OFF -DTI_BUILD_TESTS=OFF -DTI_BUILD_EXAMPLES=OFF -DCMAKE_PREFIX_PATH=${LLVM_DIR}/lib/cmake -DCMAKE_CXX_COMPILER=${LLVM_DIR}/bin/clang++ -DTI_WITH_AMDGPU=ON -DTI_WITH_CUDA=OFF -DTI_AMDGPU_ARCHS=${GPU_TARGETS}"
 
 # Install build dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
-    git wget \
-    freeglut3-dev libglfw3-dev libglm-dev libglu1-mesa-dev \
-    libjpeg-dev liblz4-dev libpng-dev libssl-dev \
-    libwayland-dev libx11-xcb-dev libxcb-dri3-dev libxcb-ewmh-dev \
-    libxcb-keysyms1-dev libxcb-randr0-dev libxcursor-dev libxi-dev \
-    libxinerama-dev libxrandr-dev libzstd-dev \
-    python3-pip cmake pybind11-dev ca-certificates \
-    llvm-${LLVM_VERSION} clang-${LLVM_VERSION} lld-${LLVM_VERSION} \
+	git wget vim git freeglut3-dev libglfw3-dev libglm-dev \
+	libglu1-mesa-dev libjpeg-dev liblz4-dev libpng-dev \
+	libssl-dev libwayland-dev libx11-xcb-dev libxcb-dri3-dev \
+	libxcb-ewmh-dev libxcb-keysyms1-dev libxcb-randr0-dev \
+	libxcursor-dev libxi-dev libxinerama-dev libxrandr-dev \
+	libzstd-dev python3-pip cmake pybind11-dev \
+	ca-certificates python3-venv rocm-llvm-dev \
  && apt-get clean && rm -rf /var/lib/apt/lists/*
 
+RUN wget https://apt.llvm.org/llvm.sh \
+	&& chmod +x llvm.sh \
+	&& apt-get update && apt-get install -y \
+	lsb-release software-properties-common gnupg \
+	&& ./llvm.sh ${LLVM_VERSION} llvm clang lld 
+
 # Copy source code
 COPY . ${SRC_DIR}
 RUN git config --global --add safe.directory "${TAICHI_SRC}"
 
 # Build Taichi and generate artifacts
 RUN cd ${TAICHI_SRC} && \
+    cd external/spdlog && \
+    git apply ../../spdlog_fmt.patch && \
+    cd ../.. && \
     ./build.py && \
     mkdir -p /tmp/artifacts && \
     cp dist/amd_taichi*.whl /tmp/artifacts/ && \
@@ -54,9 +64,11 @@ RUN cd ${TAICHI_SRC} && \
 # ---------------------------------------------
 FROM ${BASE_IMAGE} as taichi_final
 
-ARG LLVM_VERSION=15
+ARG LLVM_VERSION=20
+ARG ROCM_VERSION=7.0.0
 ENV DEBIAN_FRONTEND=noninteractive
 ENV LLVM_DIR=/usr/lib/llvm-${LLVM_VERSION}
+ENV ROCM_PATH=/opt/rocm-${ROCM_VERSION}
 ENV PATH=${LLVM_DIR}/bin:$PATH
 
 # Install runtime dependencies

From a78aacaecf61efdcf98057de3bc25549ef61fd48 Mon Sep 17 00:00:00 2001
From: Deepan Sekar <deepan.sekar@outlook.com>
Date: Tue, 4 Nov 2025 16:50:27 -0500
Subject: [PATCH 31/42] Update Dockerfile to fix pipeline issues

---
 Dockerfile.rocm | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index d894a27cb481c..c3f36a469a784 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -23,10 +23,10 @@ ENV TI_VERSION_PATCH=${TI_VERSION_PATCH}
 ENV DEBIAN_FRONTEND=noninteractive
 ENV SRC_DIR=/src
 ENV TAICHI_SRC=${SRC_DIR}/taichi
-ENV LLVM_DIR=/usr/lib/llvm-${LLVM_VERSION}
-ENV PATH=${LLVM_DIR}/bin:$PATH
+ENV LLVM_PATH=/usr/lib/llvm-${LLVM_VERSION}
+ENV PATH=${LLVM_PATH}/bin:$PATH
 ENV ROCM_PATH=/opt/rocm-${ROCM_VERSION}
-ENV TAICHI_CMAKE_ARGS="-DTI_WITH_VULKAN=OFF -DTI_WITH_OPENGL=OFF -DTI_BUILD_TESTS=OFF -DTI_BUILD_EXAMPLES=OFF -DCMAKE_PREFIX_PATH=${LLVM_DIR}/lib/cmake -DCMAKE_CXX_COMPILER=${LLVM_DIR}/bin/clang++ -DTI_WITH_AMDGPU=ON -DTI_WITH_CUDA=OFF -DTI_AMDGPU_ARCHS=${GPU_TARGETS}"
+ENV TAICHI_CMAKE_ARGS="-DTI_WITH_VULKAN=OFF -DTI_WITH_OPENGL=OFF -DTI_BUILD_TESTS=OFF -DTI_BUILD_EXAMPLES=OFF -DCMAKE_PREFIX_PATH=${LLVM_PATH}/lib/cmake -DCMAKE_CXX_COMPILER=${LLVM_PATH}/bin/clang++ -DTI_WITH_AMDGPU=ON -DTI_WITH_CUDA=OFF -DTI_AMDGPU_ARCHS=${GPU_TARGETS}"
 
 # Install build dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
@@ -56,7 +56,7 @@ RUN cd ${TAICHI_SRC} && \
     cd ../.. && \
     ./build.py && \
     mkdir -p /tmp/artifacts && \
-    cp dist/amd_taichi*.whl /tmp/artifacts/ && \
+    cp dist/taichi*.whl /tmp/artifacts/ && \
     tar -czvf /tmp/artifacts/taichi-tests.tar.gz -C /src taichi
 
 # ---------------------------------------------
@@ -67,9 +67,9 @@ FROM ${BASE_IMAGE} as taichi_final
 ARG LLVM_VERSION=20
 ARG ROCM_VERSION=7.0.0
 ENV DEBIAN_FRONTEND=noninteractive
-ENV LLVM_DIR=/usr/lib/llvm-${LLVM_VERSION}
+ENV LLVM_PATH=/usr/lib/llvm-${LLVM_VERSION}
 ENV ROCM_PATH=/opt/rocm-${ROCM_VERSION}
-ENV PATH=${LLVM_DIR}/bin:$PATH
+ENV PATH=${LLVM_PATH}/bin:$PATH
 
 # Install runtime dependencies
 RUN apt-get update && apt-get install -y \
@@ -77,10 +77,10 @@ RUN apt-get update && apt-get install -y \
     apt-get clean && rm -rf /var/lib/apt/lists/*
 
 # Copy and install Taichi wheel
-COPY --from=taichi_build /tmp/artifacts/amd_taichi*.whl /
+COPY --from=taichi_build /tmp/artifacts/taichi*.whl /
 RUN pip3 install --no-cache-dir --upgrade pip setuptools wheel && \
-    pip3 install /amd_taichi*.whl && \
-    rm /amd_taichi*.whl
+    pip3 install /taichi*.whl && \
+    rm /taichi*.whl
 
 # ---------------------------------------------
 # Stage 3: Export raw artifacts to host

From 2549e394fd13ba9380b21f0ebaf9fc1f4a203411 Mon Sep 17 00:00:00 2001
From: Deepan Sekar <deepan.sekar@outlook.com>
Date: Wed, 5 Nov 2025 08:32:59 -0500
Subject: [PATCH 32/42] dockerfile copy dir

---
 Dockerfile.rocm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index c3f36a469a784..c8386a7bf9f7f 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -46,7 +46,7 @@ RUN wget https://apt.llvm.org/llvm.sh \
 	&& ./llvm.sh ${LLVM_VERSION} llvm clang lld 
 
 # Copy source code
-COPY . ${SRC_DIR}
+COPY . ${TAICHI_SRC}
 RUN git config --global --add safe.directory "${TAICHI_SRC}"
 
 # Build Taichi and generate artifacts

From 300196b55da5c3e6812238f175e1d77f6525379b Mon Sep 17 00:00:00 2001
From: Deepan Sekar <deepan.sekar@outlook.com>
Date: Wed, 5 Nov 2025 09:46:01 -0500
Subject: [PATCH 33/42] Dockerfile reformat

---
 Dockerfile.rocm | 53 +++++++++++++++++++++++++++++++------------------
 1 file changed, 34 insertions(+), 19 deletions(-)

diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index c8386a7bf9f7f..256a84e431b9b 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -19,14 +19,12 @@ ENV TI_VERSION_MAJOR=${TI_VERSION_MAJOR}
 ENV TI_VERSION_MINOR=${TI_VERSION_MINOR}
 ENV TI_VERSION_PATCH=${TI_VERSION_PATCH}
 
-
 ENV DEBIAN_FRONTEND=noninteractive
-ENV SRC_DIR=/src
-ENV TAICHI_SRC=${SRC_DIR}/taichi
 ENV LLVM_PATH=/usr/lib/llvm-${LLVM_VERSION}
 ENV PATH=${LLVM_PATH}/bin:$PATH
 ENV ROCM_PATH=/opt/rocm-${ROCM_VERSION}
-ENV TAICHI_CMAKE_ARGS="-DTI_WITH_VULKAN=OFF -DTI_WITH_OPENGL=OFF -DTI_BUILD_TESTS=OFF -DTI_BUILD_EXAMPLES=OFF -DCMAKE_PREFIX_PATH=${LLVM_PATH}/lib/cmake -DCMAKE_CXX_COMPILER=${LLVM_PATH}/bin/clang++ -DTI_WITH_AMDGPU=ON -DTI_WITH_CUDA=OFF -DTI_AMDGPU_ARCHS=${GPU_TARGETS}"
+
+ENV TAICHI_CMAKE_ARGS="-DTI_WITH_VULKAN=OFF -DTI_WITH_OPENGL=OFF -DTI_BUILD_TESTS=OFF -DTI_BUILD_EXAMPLES=OFF -DCMAKE_PREFIX_PATH=${LLVM_PATH}/lib/cmake -DCMAKE_CXX_COMPILER=${LLVM_PATH}/bin/clang++ -DTI_WITH_AMDGPU=ON -DTI_WITH_CUDA=OFF -DTI_AMDGPU_ARCHS=${GPU_TARGETS} -DUSE_LLD=ON -DTI_WITH_LLVM=ON"
 
 # Install build dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
@@ -37,7 +35,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 	libxcursor-dev libxi-dev libxinerama-dev libxrandr-dev \
 	libzstd-dev python3-pip cmake pybind11-dev \
 	ca-certificates python3-venv rocm-llvm-dev \
- && apt-get clean && rm -rf /var/lib/apt/lists/*
+    gdb python3-dbg 
+
+WORKDIR /app
 
 RUN wget https://apt.llvm.org/llvm.sh \
 	&& chmod +x llvm.sh \
@@ -46,18 +46,17 @@ RUN wget https://apt.llvm.org/llvm.sh \
 	&& ./llvm.sh ${LLVM_VERSION} llvm clang lld 
 
 # Copy source code
-COPY . ${TAICHI_SRC}
-RUN git config --global --add safe.directory "${TAICHI_SRC}"
+COPY . .
+# RUN git config --global --add safe.directory "${TAICHI_SRC}"
 
 # Build Taichi and generate artifacts
-RUN cd ${TAICHI_SRC} && \
-    cd external/spdlog && \
-    git apply ../../spdlog_fmt.patch && \
-    cd ../.. && \
+RUN cd external/spdlog && \
+    git apply /app/spdlog_fmt.patch && \
+    cd /app && \
     ./build.py && \
     mkdir -p /tmp/artifacts && \
     cp dist/taichi*.whl /tmp/artifacts/ && \
-    tar -czvf /tmp/artifacts/taichi-tests.tar.gz -C /src taichi
+    tar -czvf /tmp/artifacts/taichi-tests.tar.gz -C /app taichi
 
 # ---------------------------------------------
 # Stage 2: Create runtime image with Taichi installed
@@ -72,15 +71,31 @@ ENV ROCM_PATH=/opt/rocm-${ROCM_VERSION}
 ENV PATH=${LLVM_PATH}/bin:$PATH
 
 # Install runtime dependencies
-RUN apt-get update && apt-get install -y \
-    python3-pip lld-${LLVM_VERSION} && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
+RUN apt-get update && apt-get install -y --no-install-recommends \
+	git wget vim git freeglut3-dev libglfw3-dev libglm-dev \
+	libglu1-mesa-dev libjpeg-dev liblz4-dev libpng-dev \
+	libssl-dev libwayland-dev libx11-xcb-dev libxcb-dri3-dev \
+	libxcb-ewmh-dev libxcb-keysyms1-dev libxcb-randr0-dev \
+	libxcursor-dev libxi-dev libxinerama-dev libxrandr-dev \
+	libzstd-dev python3-pip cmake pybind11-dev \
+	ca-certificates python3-venv rocm-llvm-dev \
+    gdb python3-dbg 
+
+WORKDIR /app
+
+RUN wget https://apt.llvm.org/llvm.sh \
+	&& chmod +x llvm.sh \
+	&& apt-get update && apt-get install -y \
+	lsb-release software-properties-common gnupg \
+	&& ./llvm.sh ${LLVM_VERSION} llvm clang lld \
+    && rm -rf llvm.sh
 
 # Copy and install Taichi wheel
-COPY --from=taichi_build /tmp/artifacts/taichi*.whl /
-RUN pip3 install --no-cache-dir --upgrade pip setuptools wheel && \
-    pip3 install /taichi*.whl && \
-    rm /taichi*.whl
+COPY --from=taichi_build /tmp/artifacts/taichi*.whl /app/
+
+RUN python3 -m pip config set global.break-system-packages true \
+    && python3 -m pip install /app/taichi*.whl \
+    && rm -rf /app/taichi*.whl
 
 # ---------------------------------------------
 # Stage 3: Export raw artifacts to host

From 8dab1711255d83f8af940f456e657f061b8ba71d Mon Sep 17 00:00:00 2001
From: Deepan Sekar <deepan.sekar@outlook.com>
Date: Fri, 7 Nov 2025 16:39:05 -0500
Subject: [PATCH 34/42] CI: Fix Dockerfile issues

---
 Dockerfile.rocm | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index 256a84e431b9b..ad3b8334d81f4 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -49,13 +49,23 @@ RUN wget https://apt.llvm.org/llvm.sh \
 COPY . .
 # RUN git config --global --add safe.directory "${TAICHI_SRC}"
 
+# Install Python build dependencies
+RUN python3 -m pip config set global.break-system-packages true && \
+    python3 -m pip install -r requirements_dev.txt
+
 # Build Taichi and generate artifacts
 RUN cd external/spdlog && \
     git apply /app/spdlog_fmt.patch && \
     cd /app && \
     ./build.py && \
+    echo "=== Listing dist directory ===" && \
+    ls -la dist/ && \
+    echo "=== Creating artifacts directory ===" && \
     mkdir -p /tmp/artifacts && \
-    cp dist/taichi*.whl /tmp/artifacts/ && \
+    echo "=== Copying wheel files ===" && \
+    cp dist/*.whl /tmp/artifacts/ && \
+    echo "=== Listing artifacts directory ===" && \
+    ls -la /tmp/artifacts/ && \
     tar -czvf /tmp/artifacts/taichi-tests.tar.gz -C /app taichi
 
 # ---------------------------------------------
@@ -91,11 +101,12 @@ RUN wget https://apt.llvm.org/llvm.sh \
     && rm -rf llvm.sh
 
 # Copy and install Taichi wheel
-COPY --from=taichi_build /tmp/artifacts/taichi*.whl /app/
+COPY --from=taichi_build /tmp/artifacts/*.whl /app/
 
 RUN python3 -m pip config set global.break-system-packages true \
-    && python3 -m pip install /app/taichi*.whl \
-    && rm -rf /app/taichi*.whl
+    && ls -la /app/ \
+    && for whl in /app/*.whl; do python3 -m pip install "$whl"; done \
+    && rm -rf /app/*.whl
 
 # ---------------------------------------------
 # Stage 3: Export raw artifacts to host

From ed1c61d5e4f397645a28581d4fa17f5f1e10a179 Mon Sep 17 00:00:00 2001
From: Deepan Sekar <deepan.sekar@outlook.com>
Date: Sun, 9 Nov 2025 21:25:31 -0500
Subject: [PATCH 35/42] Fix Tester Issues

---
 Dockerfile.rocm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index ad3b8334d81f4..228c6ae69db20 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -66,7 +66,7 @@ RUN cd external/spdlog && \
     cp dist/*.whl /tmp/artifacts/ && \
     echo "=== Listing artifacts directory ===" && \
     ls -la /tmp/artifacts/ && \
-    tar -czvf /tmp/artifacts/taichi-tests.tar.gz -C /app taichi
+    tar -czvf /tmp/artifacts/taichi-tests.tar.gz -C /app --transform 's,^,taichi/,' --exclude='external' --exclude='_skbuild' .
 
 # ---------------------------------------------
 # Stage 2: Create runtime image with Taichi installed

From 29c412942dde7358b72070c3054a6065052bc7c5 Mon Sep 17 00:00:00 2001
From: tmm77 <6461957+tmm77@users.noreply.github.com>
Date: Fri, 14 Nov 2025 12:01:23 -0500
Subject: [PATCH 36/42] removing any existing build cache

---
 Dockerfile.rocm | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index 228c6ae69db20..62ca57835f091 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -57,6 +57,7 @@ RUN python3 -m pip config set global.break-system-packages true && \
 RUN cd external/spdlog && \
     git apply /app/spdlog_fmt.patch && \
     cd /app && \
+    rm -rf /root/.cache/* && \
     ./build.py && \
     echo "=== Listing dist directory ===" && \
     ls -la dist/ && \

From 7b155bb2b6e5b74ef915180ca97db7bb1f8f5e07 Mon Sep 17 00:00:00 2001
From: Deepan Sekar <deepan.sekar@outlook.com>
Date: Fri, 14 Nov 2025 14:23:21 -0500
Subject: [PATCH 37/42] Fix Version Issues

---
 setup.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/setup.py b/setup.py
index 45166bbf20d23..f8d09f40e5431 100644
--- a/setup.py
+++ b/setup.py
@@ -9,6 +9,7 @@
 import multiprocessing
 import os
 import platform
+import re
 import shutil
 import subprocess
 import sys
@@ -48,6 +49,8 @@ def get_version():
 project_name = os.getenv("PROJECT_NAME", "taichi")
 version = get_version()
 TI_VERSION_MAJOR, TI_VERSION_MINOR, TI_VERSION_PATCH = version.split(".")
+# Strip non-numeric suffix from patch version (e.g., "0b2" -> "0") for AMD compatibility versioning
+TI_VERSION_PATCH = re.sub(r'[^0-9].*$', '', TI_VERSION_PATCH)
 
 data_files = glob.glob("python/_lib/runtime/*")
 print(data_files)

From 13a055027d08169f52814afc4a8b3484cd071b6a Mon Sep 17 00:00:00 2001
From: "Sankar, Anisha" <Anisha.Sankar@amd.com>
Date: Tue, 9 Dec 2025 10:38:59 -0500
Subject: [PATCH 38/42] Docs: Taichi component, configs and setup for 25.11
 release (#2)

---
 .wordlist.txt                                 | 865 ++++++++++++++++++
 CMakeLists.txt                                |   2 +
 README.md                                     |   8 +-
 docs/about/license.rst                        |  12 +
 docs/conf.py                                  |  58 ++
 docs/examples/taichi-examples.rst             | 140 +++
 docs/examples/taichi-get-started.rst          |  64 ++
 docs/index.rst                                |  39 +
 docs/install/taichi-install.rst               | 289 ++++++
 docs/{ => source}/conftest.py                 |   0
 docs/{ => source}/cover-in-ci.lst             |   0
 .../design/llvm_sparse_runtime.md             |   0
 docs/{ => source}/fragments/.gitkeep          |   0
 .../lang/articles/about/_category_.json       |   0
 .../lang/articles/about/overview.md           |   0
 .../lang/articles/advanced/_category_.json    |   0
 .../lang/articles/advanced/argument_pack.md   |   0
 .../articles/advanced/data_oriented_class.md  |   0
 .../lang/articles/advanced/dataclass.md       |   0
 .../lang/articles/advanced/meta.md            |   0
 .../lang/articles/advanced/odop.md            |   0
 .../lang/articles/advanced/quant.md           |   0
 .../lang/articles/basic/_category_.json       |   0
 .../lang/articles/basic/external.md           |   0
 .../{ => source}/lang/articles/basic/field.md |   0
 .../lang/articles/basic/layout.md             |   0
 .../lang/articles/basic/ndarray.md            |   0
 .../lang/articles/basic/offset.md             |   0
 .../lang/articles/basic/sparse.md             |   0
 .../lang/articles/c-api/_category_.json       |   0
 .../lang/articles/c-api/taichi_core.md        |   0
 .../lang/articles/c-api/taichi_vulkan.md      |   0
 .../articles/contribution/_category_.json     |   0
 .../contribution/contributor_guide.md         |   0
 .../lang/articles/contribution/dev_install.md |   0
 .../contribution/developer_utilities.md       |   0
 .../articles/contribution/development_tips.md |   0
 .../lang/articles/contribution/doc_writing.md |   0
 .../articles/contribution/style_guide_en.md   |   0
 .../articles/contribution/windows_debug.md    |   0
 .../lang/articles/contribution/write_test.md  |   0
 .../contribution/writing_cpp_tests.md         |   0
 .../lang/articles/debug/_category_.json       |   0
 .../lang/articles/debug/debugging.md          |   0
 .../lang/articles/deployment/_category_.json  |   0
 .../lang/articles/deployment/tutorial.md      |   0
 .../articles/differentiable/_category_.json   |   0
 .../differentiable_programming.md             |   0
 .../lang/articles/faqs/_category_.json        |   0
 docs/{ => source}/lang/articles/faqs/faq.md   |   0
 .../lang/articles/faqs/install.md             |   0
 .../lang/articles/get-started/_category_.json |   0
 .../articles/get-started/accelerate_python.md |   0
 .../get-started/accelerate_pytorch.md         |   0
 .../articles/get-started/cloth_simulation.md  |   0
 .../lang/articles/get-started/hello_world.md  |   0
 .../lang/articles/glossary/_category_.json    |   0
 .../lang/articles/glossary/glossary.md        |   0
 .../lang/articles/internals/_category_.json   |   0
 .../lang/articles/internals/compilation.md    |   0
 .../lang/articles/internals/internal.md       |   0
 .../internals/life_of_kernel_lowres.jpg       | Bin
 .../lang/articles/kernels/_category_.json     |   0
 .../lang/articles/kernels/kernel_function.md  |   0
 .../lang/articles/kernels/kernel_sync.md      |   0
 .../lang/articles/math/_category_.json        |   0
 .../lang/articles/math/linear_solver.md       |   0
 .../lang/articles/math/math_module.md         |   0
 .../lang/articles/math/sparse_matrix.md       |   0
 .../performance_tuning/_category_.json        |   0
 .../performance_tuning/performance.md         |   0
 .../articles/performance_tuning/profiler.md   |   0
 .../lang/articles/reference/_category_.json   |   0
 ...nces_between_taichi_and_python_programs.md |   0
 .../articles/reference/global_settings.md     |   0
 .../articles/reference/language_reference.md  |   0
 .../lang/articles/reference/operator.md       |   0
 .../lang/articles/reference/simt.md           |   0
 .../lang/articles/reference/syntax_sugars.md  |   0
 .../articles/static/assets/aot_tutorial.png   | Bin
 .../articles/static/assets/arrow_field.png    | Bin
 .../lang/articles/static/assets/arrows.png    | Bin
 .../bitpacked_fields_layout_example.png       | Bin
 .../static/assets/bls_indices_mapping.png     | Bin
 .../lang/articles/static/assets/circles.png   | Bin
 .../static/assets/colored_circles.png         | Bin
 .../static/assets/effect_of_offline_cache.png | Bin
 .../static/assets/floating-point_formats.png  | Bin
 .../lang/articles/static/assets/fractal.png   | Bin
 .../lang/articles/static/assets/lines.png     | Bin
 .../articles/static/assets/point_field.png    | Bin
 .../assets/quant_array_layout_example.png     | Bin
 .../lang/articles/static/assets/rect.png      | Bin
 .../lang/articles/static/assets/runtime.png   | Bin
 .../lang/articles/static/assets/triangles.png | Bin
 .../lang/articles/type_system/_category_.json |   0
 .../lang/articles/type_system/type.md         |   0
 .../lang/articles/utilities/_category_.json   |   0
 .../articles/visualization/_category_.json    |   0
 .../articles/visualization/export_results.md  |   0
 .../lang/articles/visualization/ggui.md       |   0
 .../lang/articles/visualization/gui_system.md |   0
 .../{ => source}/rfcs/20220410-rfc-process.md |   0
 .../rfcs/20220413-aot-for-all-snode.md        |   0
 docs/{ => source}/rfcs/res/.gitkeep           |   0
 .../rfcs/yyyymmdd-rfc-template.md             |   0
 docs/{ => source}/variable.json               |   0
 docs/sphinx/_toc.yml.in                       |  29 +
 docs/sphinx/requirements.in                   |   1 +
 docs/sphinx/requirements.txt                  | 276 ++++++
 docs/what-is-taichi.rst                       |  55 ++
 readthedocs.yaml                              |  14 +
 112 files changed, 1848 insertions(+), 4 deletions(-)
 create mode 100644 .wordlist.txt
 create mode 100644 docs/about/license.rst
 create mode 100644 docs/conf.py
 create mode 100644 docs/examples/taichi-examples.rst
 create mode 100644 docs/examples/taichi-get-started.rst
 create mode 100644 docs/index.rst
 create mode 100644 docs/install/taichi-install.rst
 rename docs/{ => source}/conftest.py (100%)
 rename docs/{ => source}/cover-in-ci.lst (100%)
 rename docs/{ => source}/design/llvm_sparse_runtime.md (100%)
 rename docs/{ => source}/fragments/.gitkeep (100%)
 rename docs/{ => source}/lang/articles/about/_category_.json (100%)
 rename docs/{ => source}/lang/articles/about/overview.md (100%)
 rename docs/{ => source}/lang/articles/advanced/_category_.json (100%)
 rename docs/{ => source}/lang/articles/advanced/argument_pack.md (100%)
 rename docs/{ => source}/lang/articles/advanced/data_oriented_class.md (100%)
 rename docs/{ => source}/lang/articles/advanced/dataclass.md (100%)
 rename docs/{ => source}/lang/articles/advanced/meta.md (100%)
 rename docs/{ => source}/lang/articles/advanced/odop.md (100%)
 rename docs/{ => source}/lang/articles/advanced/quant.md (100%)
 rename docs/{ => source}/lang/articles/basic/_category_.json (100%)
 rename docs/{ => source}/lang/articles/basic/external.md (100%)
 rename docs/{ => source}/lang/articles/basic/field.md (100%)
 rename docs/{ => source}/lang/articles/basic/layout.md (100%)
 rename docs/{ => source}/lang/articles/basic/ndarray.md (100%)
 rename docs/{ => source}/lang/articles/basic/offset.md (100%)
 rename docs/{ => source}/lang/articles/basic/sparse.md (100%)
 rename docs/{ => source}/lang/articles/c-api/_category_.json (100%)
 rename docs/{ => source}/lang/articles/c-api/taichi_core.md (100%)
 rename docs/{ => source}/lang/articles/c-api/taichi_vulkan.md (100%)
 rename docs/{ => source}/lang/articles/contribution/_category_.json (100%)
 rename docs/{ => source}/lang/articles/contribution/contributor_guide.md (100%)
 rename docs/{ => source}/lang/articles/contribution/dev_install.md (100%)
 rename docs/{ => source}/lang/articles/contribution/developer_utilities.md (100%)
 rename docs/{ => source}/lang/articles/contribution/development_tips.md (100%)
 rename docs/{ => source}/lang/articles/contribution/doc_writing.md (100%)
 rename docs/{ => source}/lang/articles/contribution/style_guide_en.md (100%)
 rename docs/{ => source}/lang/articles/contribution/windows_debug.md (100%)
 rename docs/{ => source}/lang/articles/contribution/write_test.md (100%)
 rename docs/{ => source}/lang/articles/contribution/writing_cpp_tests.md (100%)
 rename docs/{ => source}/lang/articles/debug/_category_.json (100%)
 rename docs/{ => source}/lang/articles/debug/debugging.md (100%)
 rename docs/{ => source}/lang/articles/deployment/_category_.json (100%)
 rename docs/{ => source}/lang/articles/deployment/tutorial.md (100%)
 rename docs/{ => source}/lang/articles/differentiable/_category_.json (100%)
 rename docs/{ => source}/lang/articles/differentiable/differentiable_programming.md (100%)
 rename docs/{ => source}/lang/articles/faqs/_category_.json (100%)
 rename docs/{ => source}/lang/articles/faqs/faq.md (100%)
 mode change 100755 => 100644
 rename docs/{ => source}/lang/articles/faqs/install.md (100%)
 rename docs/{ => source}/lang/articles/get-started/_category_.json (100%)
 rename docs/{ => source}/lang/articles/get-started/accelerate_python.md (100%)
 rename docs/{ => source}/lang/articles/get-started/accelerate_pytorch.md (100%)
 rename docs/{ => source}/lang/articles/get-started/cloth_simulation.md (100%)
 rename docs/{ => source}/lang/articles/get-started/hello_world.md (100%)
 rename docs/{ => source}/lang/articles/glossary/_category_.json (100%)
 rename docs/{ => source}/lang/articles/glossary/glossary.md (100%)
 rename docs/{ => source}/lang/articles/internals/_category_.json (100%)
 rename docs/{ => source}/lang/articles/internals/compilation.md (100%)
 rename docs/{ => source}/lang/articles/internals/internal.md (100%)
 rename docs/{ => source}/lang/articles/internals/life_of_kernel_lowres.jpg (100%)
 rename docs/{ => source}/lang/articles/kernels/_category_.json (100%)
 rename docs/{ => source}/lang/articles/kernels/kernel_function.md (100%)
 rename docs/{ => source}/lang/articles/kernels/kernel_sync.md (100%)
 rename docs/{ => source}/lang/articles/math/_category_.json (100%)
 rename docs/{ => source}/lang/articles/math/linear_solver.md (100%)
 rename docs/{ => source}/lang/articles/math/math_module.md (100%)
 rename docs/{ => source}/lang/articles/math/sparse_matrix.md (100%)
 rename docs/{ => source}/lang/articles/performance_tuning/_category_.json (100%)
 rename docs/{ => source}/lang/articles/performance_tuning/performance.md (100%)
 rename docs/{ => source}/lang/articles/performance_tuning/profiler.md (100%)
 rename docs/{ => source}/lang/articles/reference/_category_.json (100%)
 rename docs/{ => source}/lang/articles/reference/differences_between_taichi_and_python_programs.md (100%)
 rename docs/{ => source}/lang/articles/reference/global_settings.md (100%)
 rename docs/{ => source}/lang/articles/reference/language_reference.md (100%)
 rename docs/{ => source}/lang/articles/reference/operator.md (100%)
 rename docs/{ => source}/lang/articles/reference/simt.md (100%)
 rename docs/{ => source}/lang/articles/reference/syntax_sugars.md (100%)
 rename docs/{ => source}/lang/articles/static/assets/aot_tutorial.png (100%)
 rename docs/{ => source}/lang/articles/static/assets/arrow_field.png (100%)
 rename docs/{ => source}/lang/articles/static/assets/arrows.png (100%)
 rename docs/{ => source}/lang/articles/static/assets/bitpacked_fields_layout_example.png (100%)
 rename docs/{ => source}/lang/articles/static/assets/bls_indices_mapping.png (100%)
 rename docs/{ => source}/lang/articles/static/assets/circles.png (100%)
 rename docs/{ => source}/lang/articles/static/assets/colored_circles.png (100%)
 rename docs/{ => source}/lang/articles/static/assets/effect_of_offline_cache.png (100%)
 rename docs/{ => source}/lang/articles/static/assets/floating-point_formats.png (100%)
 rename docs/{ => source}/lang/articles/static/assets/fractal.png (100%)
 rename docs/{ => source}/lang/articles/static/assets/lines.png (100%)
 rename docs/{ => source}/lang/articles/static/assets/point_field.png (100%)
 rename docs/{ => source}/lang/articles/static/assets/quant_array_layout_example.png (100%)
 rename docs/{ => source}/lang/articles/static/assets/rect.png (100%)
 rename docs/{ => source}/lang/articles/static/assets/runtime.png (100%)
 rename docs/{ => source}/lang/articles/static/assets/triangles.png (100%)
 rename docs/{ => source}/lang/articles/type_system/_category_.json (100%)
 rename docs/{ => source}/lang/articles/type_system/type.md (100%)
 rename docs/{ => source}/lang/articles/utilities/_category_.json (100%)
 rename docs/{ => source}/lang/articles/visualization/_category_.json (100%)
 rename docs/{ => source}/lang/articles/visualization/export_results.md (100%)
 rename docs/{ => source}/lang/articles/visualization/ggui.md (100%)
 rename docs/{ => source}/lang/articles/visualization/gui_system.md (100%)
 rename docs/{ => source}/rfcs/20220410-rfc-process.md (100%)
 rename docs/{ => source}/rfcs/20220413-aot-for-all-snode.md (100%)
 rename docs/{ => source}/rfcs/res/.gitkeep (100%)
 rename docs/{ => source}/rfcs/yyyymmdd-rfc-template.md (100%)
 rename docs/{ => source}/variable.json (100%)
 create mode 100644 docs/sphinx/_toc.yml.in
 create mode 100644 docs/sphinx/requirements.in
 create mode 100644 docs/sphinx/requirements.txt
 create mode 100644 docs/what-is-taichi.rst
 create mode 100644 readthedocs.yaml

diff --git a/.wordlist.txt b/.wordlist.txt
new file mode 100644
index 0000000000000..c039ac3b71a1a
--- /dev/null
+++ b/.wordlist.txt
@@ -0,0 +1,865 @@
+AAC
+ABI
+ACE
+ACEs
+ACS
+AccVGPR
+AccVGPRs
+ALU
+AMD
+AMDGPU
+AMDGPUs
+AMDMIGraphX
+AMI
+AOCC
+AOMP
+AOTriton
+APBDIS
+APIC
+APIs
+APU
+ASIC
+ASICs
+ASan
+ASAN
+ASm
+ATI
+AddressSanitizer
+AlexNet
+Andrej
+Arb
+Autocast
+BARs
+BLAS
+BMC
+Blit
+Blockwise
+Bluefield
+Bootloader
+CCD
+CDNA
+CHTML
+CIFAR
+CLI
+CLion
+CMake
+CMakeLists
+CMakePackage
+CP
+CPC
+CPF
+CPP
+CPU
+CPUs
+Cron
+CSC
+CSE
+CSV
+CSn
+CTest
+CTests
+CU
+CUDA
+CUs
+CXX
+Cavium
+CentOS
+ChatGPT
+CoRR
+Codespaces
+Commitizen
+CommonMark
+Concretized
+Conda
+ConnectX
+CuPy
+Dashboarding
+DDR
+DF
+DGEMM
+DIMM
+DKMS
+DL
+DMA
+DNN
+DNNL
+DPM
+DRI
+DW
+DWORD
+Dask
+DataFrame
+DataLoader
+DataParallel
+Debian
+DeepSpeed
+Dependabot
+Deprecations
+DevCap
+Dockerfile
+Doxygen
+ELMo
+ENDPGM
+EPYC
+ESXi
+EoS
+FBGEMM
+FFT
+FFTs
+FFmpeg
+FHS
+FIXME
+FMA
+FP
+FX
+Filesystem
+FindDb
+Flang
+Fortran
+Fuyu
+GALB
+GCC
+GCD
+GCDs
+GCN
+GDB
+GDDR
+GDR
+GDS
+GEMM
+GEMMs
+GFortran
+GiB
+GIM
+GL
+GLXT
+Gloo
+GMI
+GPG
+GPR
+GPT
+GPU
+GPU's
+GPUs
+GRBM
+GenAI
+GenZ
+GitHub
+Gitpod
+HBM
+HCA
+HGX
+HIPCC
+HIPExtension
+HIPIFY
+HIPification
+HIPify
+HPC
+HPCG
+HPE
+HPL
+HSA
+HW
+HWE
+HWS
+Haswell
+Higgs
+Hyperparameters
+Huggingface
+ICD
+ICV
+IDE
+IDEs
+IFWI
+IMDb
+IOMMU
+IOP
+IOPM
+IOV
+IRQ
+ISA
+ISV
+ISVs
+ITL
+ImageNet
+InfiniBand
+Inlines
+IntelliSense
+Interop
+Intersphinx
+Intra
+Ioffe
+JAX's
+Jinja
+JSON
+Jupyter
+KFD
+KFDTest
+KMD
+KV
+KVM
+Karpathy's
+KiB
+Keras
+Khronos
+LAPACK
+LCLK
+LDS
+LLM
+LLMs
+LLVM
+LM
+LSAN
+LSan
+LTS
+LoRA
+MEM
+MERCHANTABILITY
+MFMA
+MiB
+MIGraphX
+MIOpen
+MIOpenGEMM
+MIOpen's
+MIVisionX
+MLM
+MMA
+MMIO
+MMIOH
+MMU
+MNIST
+MPI
+MSVC
+MVAPICH
+MVFFR
+Makefile
+Makefiles
+Matplotlib
+Matrox
+Megatrends
+Megatron
+Mellanox
+Mellanox's
+Meta's
+Miniconda
+MirroredStrategy
+Mixtral
+Multicore
+Multithreaded
+MyEnvironment
+MyST
+NBIO
+NBIOs
+NCCL
+NCF
+NIC
+NICs
+NLI
+NLP
+NPKit
+NPS
+NSP
+NUMA
+NVCC
+NVIDIA
+NVPTX
+NaN
+Nano
+Navi
+Noncoherently
+NousResearch's
+NumPy
+OAM
+OAMs
+OCP
+OEM
+OFED
+OMM
+OMP
+OMPI
+OMPT
+OMPX
+ONNX
+OSS
+OSU
+OpenCL
+OpenCV
+OpenFabrics
+OpenGL
+OpenMP
+OpenMPI
+OpenSSL
+OpenVX
+OpenXLA
+Oversubscription
+PagedAttention
+Pallas
+PCC
+PCI
+PCIe
+PEFT
+PEQT
+PIL
+PILImage
+POR
+PRNG
+PRs
+PaLM
+Pageable
+PeerDirect
+PerfDb
+Perfetto
+PipelineParallel
+PnP
+PowerEdge
+PowerShell
+Profiler's
+PyPi
+Pytest
+PyTorch
+Qcycles
+Qwen
+RAII
+RAS
+RCCL
+RDC
+RDC's
+RDMA
+RDNA
+README
+RHEL
+RMW
+RNN
+RNNs
+ROC
+ROCProfiler
+ROCT
+ROCTracer
+ROCclr
+ROCdbgapi
+ROCgdb
+ROCk
+ROCm
+ROCmCC
+ROCmSoftwarePlatform
+ROCmValidationSuite
+ROCprofiler
+ROCr
+RPP
+RST
+RW
+Radeon
+RelWithDebInfo
+Req
+Rickle
+RoCE
+Runfile
+Ryzen
+SALU
+SBIOS
+SCA
+SDK
+SDMA
+SDPA
+SDRAM
+SENDMSG
+SGPR
+SGPRs
+SHA
+SHARK's
+SIGQUIT
+SIMD
+SIMDs
+SKU
+SKUs
+SLES
+SMEM
+SMI
+SMT
+SPI
+SQs
+SRAM
+SRAMECC
+SVD
+SWE
+SerDes
+ShareGPT
+Shlens
+Skylake
+Softmax
+Spack
+SplitK
+Supermicro
+Szegedy
+TCA
+TCC
+TCI
+TCIU
+TCP
+TCR
+TF
+TFLOPS
+TP
+TPS
+TPU
+TPUs
+TSME
+Tagram
+TensileLite
+TensorBoard
+TensorFlow
+TensorParallel
+ToC
+TorchAudio
+torchaudio
+TorchElastic
+TorchMIGraphX
+torchrec
+TorchScript
+TorchServe
+torchserve
+torchtext
+TorchVision
+TransferBench
+TrapStatus
+UAC
+UC
+UCC
+UCX
+UE
+UIF
+UMC
+USM
+UTCL
+UTIL
+Uncached
+Unittests
+Unhandled
+VALU
+VBIOS
+VGPR
+VGPRs
+VM
+VMEM
+VMWare
+VRAM
+VSIX
+VSkipped
+Vanhoucke
+Vulkan
+WGP
+WGPs
+WX
+WikiText
+Wojna
+Workgroups
+Writebacks
+XCD
+XCDs
+XGBoost
+XGBoost's
+XGMI
+XT
+XTX
+Xeon
+Xilinx
+Xnack
+Xteam
+YAML
+YML
+YModel
+ZeRO
+ZenDNN
+accuracies
+activations
+addr
+alloc
+allocatable
+allocator
+allocators
+amdgpu
+api
+atmi
+atomics
+autogenerated
+autotune
+avx
+awk
+backend
+backends
+benchmarked
+benchmarking
+bfloat
+bilinear
+bitcode
+bitsandbytes
+blit
+bootloader
+boson
+bosons
+br
+buildable
+bursty
+bzip
+cacheable
+cd
+centos
+centric
+changelog
+chiplet
+cmake
+cmd
+coalescable
+codename
+collater
+comgr
+completers
+composable
+concretization
+config
+conformant
+constructible
+convolutional
+convolves
+copyable
+cpp
+csn
+cuBLAS
+cuda
+cuDNN
+cudnn
+cuFFT
+cuLIB
+cuRAND
+cuSOLVER
+cuSPARSE
+customizations
+cTDP
+dataset
+datasets
+dataspace
+datatype
+datatypes
+dbgapi
+de
+deallocation
+debuggability
+debian
+denoise
+denoised
+denoises
+denormalize
+dequantization
+dequantizes
+deserializers
+detections
+dev
+devicelibs
+devsel
+dimensionality
+disambiguates
+distro
+distros
+dkms
+dtype
+el
+embeddings
+enablement
+encodings
+endpgm
+enqueue
+env
+epilog
+etcetera
+ethernet
+exascale
+executables
+ffmpeg
+filesystem
+fortran
+fp
+gRPC
+galb
+gcc
+gdb
+gfortran
+gfx
+githooks
+github
+globals
+gnupg
+grayscale
+gzip
+heterogenous
+hipBLAS
+hipBLASLt
+hipBLASLt's
+hipblaslt
+hipCUB
+hipFFT
+hipFORT
+hipLIB
+hipRAND
+hipSOLVER
+hipSPARSE
+hipSPARSELt
+hipTensor
+hipamd
+hipblas
+hipcub
+hipfft
+hipfort
+hipify
+hipsolver
+hipsparse
+hlist
+hotspotting
+hpc
+hpp
+hsa
+hsakmt
+hyperparameter
+hyperparameters
+iDRAC
+ib_core
+inband
+incrementing
+inductor
+inferencing
+inflight
+init
+initializer
+inlining
+installable
+interop
+interprocedural
+intra
+invariants
+invocating
+ipo
+jax
+kdb
+kfd
+kv
+lang
+latencies
+len
+libfabric
+libjpeg
+libs
+linearized
+linter
+linux
+llvm
+localscratch
+logits
+lossy
+macOS
+matchers
+microarchitecture
+migraphx
+miopen
+miopengemm
+mivisionx
+mjx
+mkdir
+mlirmiopen
+mtypes
+mutex
+mvffr
+namespace
+namespaces
+nanoGPT
+num
+numref
+ocl
+opencl
+opencv
+openmp
+openssl
+optimizers
+os
+oversubscription
+pageable
+pallas
+parallelization
+parallelizing
+parameterization
+passthrough
+perfcounter
+performant
+perl
+pragma
+pre
+prebuild
+prebuilt
+precompiled
+preconditioner
+preconfigured
+preemptible
+prefetch
+prefetchable
+prefill
+prefills
+preloaded
+preprocess
+preprocessed
+preprocessing
+preprocessor
+prequantized
+prerequisites
+profiler
+profilers
+protobuf
+pseudorandom
+py
+recommender
+recommenders
+quantile
+quantizer
+quasirandom
+queueing
+rccl
+rdc
+rdma
+reStructuredText
+redirections
+refactorization
+reformats
+repo
+repos
+representativeness
+req
+resampling
+rescaling
+reusability
+roadmap
+roc
+rocAL
+rocALUTION
+rocBLAS
+rocDecode
+rocFFT
+rocHPCG
+rocJPEG
+rocLIB
+rocMLIR
+rocPRIM
+rocPyDecode
+rocRAND
+rocSOLVER
+rocSPARSE
+rocThrust
+rocWMMA
+rocalution
+rocblas
+rocclr
+rocfft
+rocm
+rocminfo
+rocprim
+rocprof
+rocprofiler
+rocr
+rocrand
+rocsolver
+rocsparse
+rocthrust
+roctracer
+rst
+runtime
+runtimes
+sL
+scalability
+scalable
+scipy
+seealso
+sendmsg
+seqs
+serializers
+shader
+sharding
+sigmoid
+sm
+smi
+softmax
+spack
+src
+stochastically
+strided
+subcommand
+subdirectory
+subexpression
+subfolder
+subfolders
+submodule
+submodules
+supercomputing
+symlink
+symlinks
+sys
+td
+tensorfloat
+th
+tokenization
+tokenize
+tokenized
+tokenizer
+tokenizes
+toolchain
+toolchains
+toolset
+toolsets
+torchvision
+tqdm
+tracebacks
+txt
+uarch
+uncached
+uncorrectable
+underoptimized
+unhandled
+uninstallation
+unmapped
+unsqueeze
+unstacking
+unswitching
+untrusted
+untuned
+upvote
+USM
+UTCL
+UTIL
+utils
+vL
+variational
+vdi
+vectorizable
+vectorization
+vectorize
+vectorized
+vectorizer
+vectorizes
+virtualize
+virtualized
+vjxb
+voxel
+walkthrough
+walkthroughs
+watchpoints
+wavefront
+wavefronts
+whitespace
+whitespaces
+workgroup
+workgroups
+writeback
+writebacks
+wrreq
+wzo
+xargs
+xGMI
+xz
+yaml
+ysvmadyb
+zypper
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 96d6c92301471..54eadfb97b798 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,6 +2,8 @@
 # The Taichi Programming Language
 #*********************************************************************
 
+rocm_setup_version(VERSION 1.8.0b2)
+
 cmake_minimum_required(VERSION 3.17)
 
 project(taichi)
diff --git a/README.md b/README.md
index b7f06d3c48151..6e44f1d45a7d7 100644
--- a/README.md
+++ b/README.md
@@ -38,7 +38,7 @@ The language has broad applications spanning real-time physical simulation, nume
 - Flexibility: Taichi Lang provides a set of generic data containers known as *SNode* (/ˈsnoʊd/), an effective mechanism for composing hierarchical, multi-dimensional fields. This can cover many use patterns in numerical simulation (e.g. [spatially sparse computing](https://docs.taichi-lang.org/docs/sparse)).
 - Performance: With the `@ti.kernel` decorator, Taichi Lang's JIT compiler automatically compiles your Python functions into efficient GPU or CPU machine code for parallel execution.
 - Portability: Write your code once and run it everywhere. Currently, Taichi Lang supports most mainstream GPU APIs, such as CUDA and Vulkan.
-- ... and many more features! A cross-platform, Vulkan-based 3D visualizer, [differentiable programming](https://docs.taichi-lang.org/docs/differentiable_programming),  [quantized computation](https://github.com/taichi-dev/quantaichi) (experimental), etc.
+- Additional feature support: A cross-platform, Vulkan-based 3D visualizer, [differentiable programming](https://docs.taichi-lang.org/docs/differentiable_programming),  [quantized computation](https://github.com/taichi-dev/quantaichi) (experimental), and many more.
 
 ## Getting Started
 
@@ -69,7 +69,7 @@ Use Python's package installer **pip** to install Taichi Lang:
 pip install --upgrade taichi
 ```
 
-*We also provide a nightly package. Note that nightly packages may crash because they are not fully tested.  We cannot guarantee their validity, and you are at your own risk trying out our latest, untested features. The nightly packages can be installed from our self-hosted PyPI (Using self-hosted PyPI allows us to provide more frequent releases over a longer period of time)*
+*Nightly packages are also provided. Note: nightly packages may crash because they are not fully tested.  The AMD ROCm Taichi team does not guarantee their validity, and you are at your own risk trying out latest, untested features. The nightly packages can be installed from our self-hosted PyPI (Using self-hosted PyPI allows us to provide more frequent releases over a longer period of time)*
 
 ```bash
 pip install -i https://pypi.taichi.graphics/simple/ taichi-nightly
@@ -139,7 +139,7 @@ If you wish to try our experimental features or build Taichi Lang for your own e
 
 ## Contributing
 
-Kudos to all of our amazing contributors! Taichi Lang thrives through open-source. In that spirit, we welcome all kinds of contributions from the community. If you would like to participate, check out the [Contribution Guidelines](CONTRIBUTING.md) first.
+Thank you to all our amazing contributors! Taichi Lang thrives through open-source. In that spirit, we welcome all kinds of contributions from the community. If you would like to participate, review the [Contribution Guidelines](CONTRIBUTING.md) to get started.
 
 <a href="https://github.com/taichi-dev/taichi/graphs/contributors"><img src="https://raw.githubusercontent.com/taichi-dev/public_files/master/taichi/contributors_taichi-dev_taichi_18.png" width="800px"></a>
 
@@ -202,7 +202,7 @@ For more information about the events or community, please refer to [this page](
 
 ### Citations
 
-If you use Taichi Lang in your research, please cite the corresponding papers:
+To use Taichi Lang in your research, citations for corresponding papers are as follows:
 
 - [**(SIGGRAPH Asia 2019) Taichi: High-Performance Computation on Sparse Data Structures**](https://yuanming.taichi.graphics/publication/2019-taichi/taichi-lang.pdf) [[Video]](https://youtu.be/wKw8LMF3Djo) [[BibTex]](https://raw.githubusercontent.com/taichi-dev/taichi/master/misc/taichi_bibtex.txt) [[Code]](https://github.com/taichi-dev/taichi)
 - [**(ICLR 2020) DiffTaichi: Differentiable Programming for Physical Simulation**](https://arxiv.org/abs/1910.00935) [[Video]](https://www.youtube.com/watch?v=Z1xvAZve9aE) [[BibTex]](https://raw.githubusercontent.com/taichi-dev/taichi/master/misc/difftaichi_bibtex.txt) [[Code]](https://github.com/yuanming-hu/difftaichi)
diff --git a/docs/about/license.rst b/docs/about/license.rst
new file mode 100644
index 0000000000000..85a8ca410fbb2
--- /dev/null
+++ b/docs/about/license.rst
@@ -0,0 +1,12 @@
+.. meta::
+  :description: Taichi Lang license
+  :keywords: Taichi Lang, parallel programming, ROCm, developer, reference, python
+
+.. _license:
+
+******************************************
+License
+******************************************
+
+.. include:: ../../LICENSE
+  :literal:
\ No newline at end of file
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000000000..8ae88ec38c22a
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,58 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+import re
+
+'''
+html_theme is usually unchanged (rocm_docs_theme).
+flavor defines the site header display, select the flavor for the corresponding portals
+flavor options: rocm, rocm-docs-home, rocm-blogs, rocm-ds, instinct, ai-developer-hub, local, generic
+'''
+html_theme = "rocm_docs_theme"
+html_theme_options = {"flavor": "rocm-simulation", "repository_url": "https://github.com/ROCm/taichi/"}
+
+'''
+docs_header_version is used to manually configure the version in the header. If
+there exists a non-null value mapped to docs_header_version, then the header in
+the documentation page will contain the given version string.
+'''
+html_context = {
+    "docs_header_version": "25.11"
+}
+
+
+# This section turns on/off article info
+setting_all_article_info = True
+all_article_info_os = ["linux"]
+all_article_info_author = ""
+
+# Dynamically extract component version
+with open('../CMakeLists.txt', encoding='utf-8') as f:
+    pattern = r'.*\brocm_setup_version\(VERSION\s+([0-9A-Za-z._-]+)' # Update according to each component's CMakeLists.txt
+    match = re.search(pattern,
+                      f.read())
+    if not match:
+        raise ValueError("VERSION not found!")
+    version_number = match[1]
+
+# for PDF output on Read the Docs
+project = "Taichi Lang"
+author = "Advanced Micro Devices, Inc."
+copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved."
+version = version_number
+release = version_number
+
+external_toc_path = "./sphinx/_toc.yml" # Defines Table of Content structure definition path
+
+# Add more addtional package accordingly
+extensions = [
+    "rocm_docs", 
+    "sphinx.ext.autodoc", # for Python docstrings
+] 
+
+html_title = f"{project} {version_number} documentation"
+
+external_projects_current_project = "Taichi Lang"
diff --git a/docs/examples/taichi-examples.rst b/docs/examples/taichi-examples.rst
new file mode 100644
index 0000000000000..b4f0683fda779
--- /dev/null
+++ b/docs/examples/taichi-examples.rst
@@ -0,0 +1,140 @@
+.. meta::
+  :description: Taichi Lang examples
+  :keywords: Taichi Lang, JIT, parallel, programming, python, ROCm, example, sample, tutorial
+
+.. _run-a-taichi-example:
+
+********************************************************************
+Run a Taichi Lang example
+********************************************************************
+
+Prepared Taichi Lang examples using decorators
+================================================================================
+
+The following examples show you how to use decorators, and is organized by use case.
+
+Count primes
+--------------------------------------------------------------------------------
+
+In this example, the function ``is_prime`` will be used in the kernel ``count_primes``. 
+The code below is written as a Taichi program by decorating ``is_prime``
+with the Taichi Lang decorator ``@ti.func`` and decorating ``count_primes`` with the Taichi
+Lang decorator ``@ti.kernel``.
+
+1. To run this example, copy the code below to a file named ``count_primes.py``:
+
+.. code-block:: bash
+
+   import taichi as ti
+   ti.init(arch=ti.gpu)
+
+   @ti.func
+   def is_prime(n: int):
+      result = True
+      for k in range(2, int(n ** 0.5) + 1):
+         if n % k == 0:
+               result = False
+               break
+      return result
+
+   @ti.kernel
+   def count_primes(n: int) -> int:
+      count = 0
+      for k in range(2, n):
+         if is_prime(k):
+               count += 1
+
+      return count
+
+   print(count_primes(1000000))
+
+2. Once this file has been created, execute the code in your Docker container with the following command:
+
+.. code-block:: bash
+
+   python3 count_primes.py
+
+3. The output should be similar to the output below:
+
+.. code-block:: bash
+
+   [Taichi] version 1.8.0b2, llvm 20.0.0, commit f7911653, linux, python 3.12.3
+   [Taichi] Starting on arch=amdgpu
+   78498
+
+Longest common subsequence 
+--------------------------------------------------------------------------------
+
+This is an example of longest common subsequence kernel. You don't
+need a helper function, so the only decorator needed is ``@ti.kernel`` to 
+accelerate the kernel function ``compute_lcs``.
+
+1. To run this example, copy the code below into a file named ``lcs.py``:
+
+.. code-block:: bash
+
+   import taichi as ti
+   import numpy as np
+
+   ti.init(arch=ti.gpu)
+
+   benchmark = True
+
+   N = 15000
+
+   f = ti.field(dtype=ti.i32, shape=(N + 1, N + 1))
+
+   if benchmark:
+      a_numpy = np.random.randint(0, 100, N, dtype=np.int32)
+      b_numpy = np.random.randint(0, 100, N, dtype=np.int32)
+   else:
+      a_numpy = np.array([0, 1, 0, 2, 4, 3, 1, 2, 1], dtype=np.int32)
+      b_numpy = np.array([4, 0, 1, 4, 5, 3, 1, 2], dtype=np.int32)
+
+   @ti.kernel
+   def compute_lcs(a: ti.types.ndarray(), b: ti.types.ndarray()) -> ti.i32:
+      len_a, len_b = a.shape[0], b.shape[0]
+
+      ti.loop_config(serialize=True) # Disable auto-parallelism in Taichi
+      for i in range(1, len_a + 1):
+         for j in range(1, len_b + 1):
+                  f[i, j] = ti.max(f[i - 1, j - 1] + (a[i - 1] == b[j - 1]),
+                           ti.max(f[i - 1, j], f[i, j - 1]))
+
+      return f[len_a, len_b]
+
+
+   print(compute_lcs(a_numpy, b_numpy))
+
+2. Once this file has been created, execute the code in your Docker container with the following command:
+
+.. code-block:: bash
+
+   python3 lcs.py
+
+3. The output should be similar to the output below:
+
+.. code-block:: bash
+
+   [Taichi] version 1.8.0b2, llvm 20.0.0, commit f7911653, linux, python 3.12.3
+   [Taichi] Starting on arch=amdgpu
+   2706
+
+Use cases and recommendations
+================================================================================
+
+* The `Modernizing Taichi Lang to LLVM 20 for MI325X GPU Acceleration
+  <https://rocm.blogs.amd.com/artificial-intelligence/taichi_mi300x/README.html>`__
+  blog highlights Taichi Lang as an open-source programming language designed for high-performance
+  numerical computation, particularly in domains such as real-time physical simulation,
+  artificial intelligence, computer vision, robotics, and visual effects. Taichi Lang
+  is embedded in Python and uses just-in-time (JIT) compilation frameworks like
+  LLVM to optimize execution on GPUs and CPUs. The blog emphasizes the versatility
+  of Taichi Lang in enabling complex simulations and numerical algorithms, making
+  it ideal for developers working on compute-intensive tasks. Developers are
+  encouraged to follow recommended coding patterns and utilize Taichi Lang decorators
+  for performance optimization. Prebuilt Docker images integrating ROCm, PyTorch, and
+  Taichi are provided for simplified installation and deployment, making it easier
+  to leverage Taichi Lang for advanced computational workloads.
+
+Refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__ to search for Taichi examples and best practices to optimize your workflows on AMD GPUs.
diff --git a/docs/examples/taichi-get-started.rst b/docs/examples/taichi-get-started.rst
new file mode 100644
index 0000000000000..56207fd6bfb00
--- /dev/null
+++ b/docs/examples/taichi-get-started.rst
@@ -0,0 +1,64 @@
+.. meta::
+  :description: Get started using Taichi Lang
+  :keywords: Taichi Lang, JIT, parallel, programming, python, ROCm, example, sample, tutorial
+
+.. _taichi-get-started:
+
+********************************************************************
+Get started with Taichi Lang
+********************************************************************
+
+When writing compute-intensive tasks, you can make use of the two 
+decorators ``@ti.func`` and ``@ti.kernel``. Functions decorated 
+with ``@ti.kernel`` are kernels that serve as the entry points where
+Taichi’s runtime takes over the tasks, and they must be directly
+invoked by Python code. Functions decorated with ``@ti.func`` are
+building blocks of kernels and can only be invoked by another
+Taichi Lang function or a kernel. These decorators instruct Taichi
+Lang to take over the computation tasks and compile the decorated
+functions to machine code using just-in-time (JIT) compiler. As a
+result, calls to these functions are executed on multi-core CPUs
+or GPUs.
+
+Below you can see how simple it is to use the Taichi Lang ``@ti.func``
+and ``@ti.kernel`` decorators to accelerate Python code. First, you 
+can see the Python code without using Taichi Lang. To enact Taichi Lang 
+in this example, the function ``inv_square`` serves as a building 
+block function for the kernel ``partial_sum``.
+
+The example Python code without Taichi Lang:
+
+.. code-block:: bash
+
+    def inv_square(x):  # A function
+        return 1.0 / (x * x)
+
+    def partial_sum(n: int) -> float:  # A kernel
+        total = 0.0
+        for i in range(1, n + 1):
+            total += inv_square(i)
+        return total
+
+    partial_sum(1000)
+
+1. To write this code in Taichi Lang, import and initialize Taichi Lang for code acceleration:
+
+.. code-block:: bash
+   
+   import taichi as ti
+   ti.init(arch=ti.gpu)
+
+2. Then, you can decorate the building block function and kernel with the ``@ti.func`` and ``@ti.kernel`` decorators, respectively:
+
+.. code-block:: bash
+
+   @ti.func
+   def inv_square(x):  # A Taichi function
+       return 1.0 / (x * x)
+
+   @ti.kernel
+   def partial_sum(n: int) -> float:  # A kernel
+       total = 0.0
+       for i in range(1, n + 1):
+           total += inv_square(i)
+       return total
\ No newline at end of file
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000000000..8ecc6431ad235
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,39 @@
+.. meta::
+  :description: Taichi Lang documentation
+  :keywords: Taichi Lang, ROCm, documentation, LLVM, JIT, parallel programming, GPU
+
+.. _taichi-documentation-index:
+
+********************************************************************
+Taichi Lang documentation
+********************************************************************
+
+`Taichi Lang <https://www.taichi-lang.org/>`_ is an open-source, imperative, and parallel 
+programming language designed for high-performance numerical computation. Taichi Lang is 
+part of the AMD ROCm™ Simulation Domain toolkit (ROCm-Simulation). Embedded in Python,
+it leverages just-in-time (JIT) compilation frameworks such as LLVM to accelerate 
+compute-intensive Python code by compiling it to native GPU or CPU instructions.
+
+Taichi Lang is widely used across various domains, such as real-time physical simulation, 
+numerical computing, augmented reality, artificial intelligence, computer vision, robotics, 
+visual effects for film and gaming, and general-purpose computing.
+
+The Taichi Lang public repository is located at `https://github.com/ROCm/taichi <https://github.com/ROCm/taichi>`__.
+
+.. grid:: 2
+  :gutter: 3
+
+  .. grid-item-card:: Install
+
+    * :doc:`Install Taichi Lang <install/taichi-install>`
+
+  .. grid-item-card:: Examples
+
+    * :doc:`Get started with Taichi Lang <examples/taichi-get-started>`  
+    * :doc:`Run a Taichi Lang example <examples/taichi-examples>`  
+
+
+To contribute to the documentation, refer to
+`Contributing to ROCm <https://rocm.docs.amd.com/en/latest/contribute/contributing.html>`_.
+
+You can find licensing information on the :doc:`Licensing <about/license>` page.
diff --git a/docs/install/taichi-install.rst b/docs/install/taichi-install.rst
new file mode 100644
index 0000000000000..9d26d7b203fc5
--- /dev/null
+++ b/docs/install/taichi-install.rst
@@ -0,0 +1,289 @@
+.. meta::
+  :description: installing Taichi Lang for ROCm
+  :keywords: installation instructions, parallel programming, JIT, LLVM, AMD, ROCm, Taichi Lang
+
+.. _taichi-on-rocm-installation:
+
+********************************************************************
+Taichi Lang on ROCm installation
+********************************************************************
+
+System requirements
+====================================================================
+
+To use Taichi Lang `1.8.0b2 <https://github.com/ROCm/taichi/tree/release/1.8.0b2>`__, you need the following prerequisites:
+
+- **ROCm version:** `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__ (recommended)
+- **Operating system:** Ubuntu 22.04, 24.04
+- **GPU platform:** AMD Instinct™ MI355X, MI325X, MI300X, MI250X, MI210
+- **Python:** `3.12.3 <https://www.python.org/downloads/release/python-3123/>`__, `3.10.12 <https://www.python.org/downloads/release/python-31012/>`__
+
+Install Taichi
+================================================================================
+
+To install Taichi Lang on ROCm, you have the following options:
+
+- :ref:`Use the prebuilt Docker image <using-docker-with-taichi-pre-installed>` **(recommended)**
+- :ref:`Use a wheels package <taichi-wheels-package>`
+- :ref:`Build your own docker image <build-taichi-rocm-docker-image>`
+
+.. _using-docker-with-taichi-pre-installed:
+
+Use a prebuilt Docker image with Taichi Lang pre-installed
+--------------------------------------------------------------------------------
+
+Docker is the recommended method to set up a Taichi Lang environment, as it avoids potential installation issues. 
+The tested, prebuilt image includes Taichi, Python, ROCm, and other dependencies.
+
+1. Pull the Docker image:
+
+   .. tab-set::
+
+      .. tab-item:: Ubuntu 24.04
+         :sync: ubuntu-24
+
+         .. code-block:: shell
+
+            docker pull rocm/taichi:taichi-1.8.0b2_rocm7.0.0_ubuntu24.04_py3.12.3
+
+         See `rocm/taichi:taichi-1.8.0b2_rocm7.0.0_ubuntu24.04_py3.12.3
+         <https://hub.docker.com/layers/rocm/taichi/taichi-1.8.0b2_rocm7.0.0_ubuntu24.04_py3.12.3/images/sha256-15bb5ad69e0dff0cc5e0805a5d271a82d04d6d9c32d668bfcf34981ca99f68eb>`__
+         on Docker Hub.
+
+      .. tab-item:: Ubuntu 22.04
+         :sync: ubuntu-22
+
+         .. code-block:: shell
+
+            docker pull rocm/taichi:taichi-1.8.0b2_rocm7.0.0_ubuntu22.04_py3.10.12
+
+         See `rocm/taichi:taichi-1.8.0b2_rocm7.0.0_ubuntu22.04_py3.10.12
+         <https://hub.docker.com/layers/rocm/taichi/taichi-1.8.0b2_rocm7.0.0_ubuntu22.04_py3.10.12/images/sha256-151516ac981ab1309fca0d868e25dd1231728449b6563b5e8c660d7cb47777fb>`__
+         on Docker Hub.
+
+2. Launch and connect to the container:
+
+   .. tab-set::
+
+      .. tab-item:: Ubuntu 24.04
+         :sync: ubuntu-24
+
+         .. code-block:: shell
+
+            docker run -it -d \
+               --cap-add=SYS_PTRACE \
+               --security-opt seccomp=unconfined \
+               --ipc=host \
+               --shm-size=64G \
+               --network=host \
+               --device=/dev/kfd \
+               --device=/dev/dri \
+               --group-add video \
+               -v "$(pwd)":/taichi_dir \
+               --name rocm_taichi \
+               rocm/taichi:taichi-1.8.0b2_rocm7.0.0_ubuntu24.04_py3.12.3
+
+      .. tab-item:: Ubuntu 22.04
+         :sync: ubuntu-22
+
+         .. code-block:: shell
+
+            docker run -it -d \
+               --cap-add=SYS_PTRACE \
+               --security-opt seccomp=unconfined \
+               --ipc=host \
+               --shm-size=64G \
+               --network=host \
+               --device=/dev/kfd \
+               --device=/dev/dri \
+               --group-add video \
+               -v "$(pwd)":/taichi_dir \
+               --name rocm_taichi \
+               rocm/taichi:taichi-1.8.0b2_rocm7.0.0_ubuntu22.04_py3.10.12
+
+.. _taichi-wheels-package:
+
+Use a wheels package
+--------------------------------------------------------------------------------
+
+The Taichi Lang ``.whl`` packages are hosted on the AMD PyPI repository. 
+Instead of manually downloading the files, you can simply install Taichi Lang using ``pip`` with the provided URL. 
+This command will automatically download and install the appropriate ``.whl`` file.
+
+.. code-block:: bash
+
+   pip install amd-taichi==1.8.0b2 --extra-index-url=https://pypi.amd.com/simple
+   sudo apt-get update
+   sudo apt-get install -y lld
+
+.. _build-taichi-rocm-docker-image:
+
+Build your own Docker image
+--------------------------------------------------------------------------------
+
+If you prefer to use the ROCm Ubuntu image, or already have a ROCm Ubuntu container, follow these steps to install Taichi in the container.
+
+1. Pull the ROCm Ubuntu Docker image:
+
+   .. tab-set::
+
+      .. tab-item:: Ubuntu 24.04
+         :sync: ubuntu-24
+
+         .. code-block:: shell
+
+            docker pull rocm/dev-ubuntu-24.04:7.0-complete
+
+         See `rocm/dev-ubuntu-24.04:7.0-complete
+         <https://hub.docker.com/layers/rocm/dev-ubuntu-24.04/7.0-complete/images/sha256-ffd8ac00ca6c8e2dbfd0c364c7cc27542f90148f3f358d74efd028f67c33607b>`__
+         on Docker Hub.
+
+      .. tab-item:: Ubuntu 22.04
+         :sync: ubuntu-22
+
+         .. code-block:: shell
+
+            docker pull rocm/dev-ubuntu-22.04:7.0-complete
+
+         See `rocm/dev-ubuntu-22.04:7.0-complete
+         <https://hub.docker.com/layers/rocm/dev-ubuntu-22.04/7.0-complete/images/sha256-b4be4b0b29e46d56e9bea2cd06500f4519aaac30dc5df02bd4710bbf393c1c4c>`__
+         on Docker Hub.
+
+2. Launch the Docker container:
+
+   .. tab-set::
+
+      .. tab-item:: Ubuntu 24.04
+         :sync: ubuntu-24
+
+         .. code-block:: shell
+
+            docker run -it -d \
+                  --cap-add=SYS_PTRACE \
+                  --security-opt seccomp=unconfined \
+                  --ipc=host \
+                  --shm-size=64G \
+                  --network=host \
+                  --device=/dev/kfd \
+                  --device=/dev/dri \
+                  --group-add video \
+                  -v "$(pwd)":/taichi_dir \
+                  --name rocm_taichi \
+                  rocm/dev-ubuntu-24.04:7.0-complete \
+                  /bin/bash
+
+      .. tab-item:: Ubuntu 22.04
+         :sync: ubuntu-22
+
+         .. code-block:: shell
+
+            docker run -it -d \
+                  --cap-add=SYS_PTRACE \
+                  --security-opt seccomp=unconfined \
+                  --ipc=host \
+                  --shm-size=64G \
+                  --network=host \
+                  --device=/dev/kfd \
+                  --device=/dev/dri \
+                  --group-add video \
+                  -v "$(pwd)":/taichi_dir \
+                  --name rocm_taichi \
+                  rocm/dev-ubuntu-22.04:7.0-complete \
+                  /bin/bash
+
+3. Inside the running container, install build dependencies:
+
+   .. code-block:: bash
+
+      apt-get update && apt-get install -y --no-install-recommends \
+         git wget vim git freeglut3-dev libglfw3-dev libglm-dev \
+         libglu1-mesa-dev libjpeg-dev liblz4-dev libpng-dev \
+         libssl-dev libwayland-dev libx11-xcb-dev libxcb-dri3-dev \
+         libxcb-ewmh-dev libxcb-keysyms1-dev libxcb-randr0-dev \
+         libxcursor-dev libxi-dev libxinerama-dev libxrandr-dev \
+         libzstd-dev python3-pip cmake pybind11-dev \
+         ca-certificates python3-venv rocm-llvm-dev \
+         gdb python3-dbg 
+
+4. Setup LLVM:
+
+   .. code-block:: bash
+      
+      export LLVM_VERSION=20
+      export LLVM_PATH=/usr/lib/llvm-${LLVM_VERSION}
+      export PATH=${LLVM_PATH}/bin:$PATH
+      
+      wget https://apt.llvm.org/llvm.sh \
+         && chmod +x llvm.sh \
+         && apt-get update && apt-get install -y \
+         lsb-release software-properties-common gnupg \
+         && ./llvm.sh ${LLVM_VERSION} llvm clang lld
+
+5. Clone the `https://github.com/ROCm/taichi <https://github.com/ROCm/taichi>`_ repository with the desired branch:
+
+   .. code-block:: bash
+      
+      cd /taichi_dir
+      git clone --recursive https://github.com/ROCm/taichi -b release/v1.8.0b2
+      cd taichi
+
+6. Build the Taichi Lang wheel:
+
+   .. code-block:: bash
+
+      export GPU_TARGETS=gfx950,gfx942,gfx90a
+      export TAICHI_CMAKE_ARGS="-DTI_WITH_VULKAN=OFF -DTI_WITH_OPENGL=OFF -DTI_BUILD_TESTS=ON -DTI_BUILD_EXAMPLES=OFF -DCMAKE_PREFIX_PATH=${LLVM_PATH}/lib/cmake -DCMAKE_CXX_COMPILER=${LLVM_PATH}/bin/clang++ -DTI_WITH_AMDGPU=ON -DTI_WITH_CUDA=OFF -DTI_AMDGPU_ARCHS=${GPU_TARGETS} -DUSE_LLD=ON -DTI_WITH_LLVM=ON"
+
+      cd /taichi_dir/taichi/external/spdlog \
+      && git apply /taichi_dir/taichi/spdlog_fmt.patch \
+      && cd /taichi_dir/taichi \
+      && ./build.py
+
+7. Install the Taichi Lang ``.whl`` file:
+
+   .. code-block:: bash
+
+      python3 -m pip config set global.break-system-packages true \
+      && python3 -m pip install /taichi_dir/taichi/dist/taichi*.whl
+
+
+.. _build-taichi-docker-from-source:
+
+Test the Taichi Lang installation
+================================================================================
+
+Clone the `https://github.com/ROCm/taichi <https://github.com/ROCm/taichi>`_ repository:
+
+.. code-block:: bash
+
+   sudo apt-get update
+   sudo apt-get install -y git
+   git clone --recursive https://github.com/ROCm/taichi -b amd-release/v1.8.0b2
+
+To test the Taichi Lang installation, run the ``laplace`` example in the source code: 
+
+.. code-block:: bash
+
+   python3 taichi/python/taichi/examples/algorithm/laplace.py
+
+Example output using ``laplace``:
+
+.. code-block:: bash
+
+   [Taichi] version 1.8.0, llvm 15.0.0, commit f7911653, linux, python python 3.12.3
+   [Taichi] Starting on arch=amdgpu
+   0.0
+   4.0
+   0.0
+   0.0
+   4.0
+   0.0
+   0.0
+   4.0
+   0.0
+   0.0
+
+Run a Taichi Lang example
+====================================================================
+
+A set of examples is available to help you get started. See :doc:`run a Taichi Lang example <../examples/taichi-examples>` for more details.
diff --git a/docs/conftest.py b/docs/source/conftest.py
similarity index 100%
rename from docs/conftest.py
rename to docs/source/conftest.py
diff --git a/docs/cover-in-ci.lst b/docs/source/cover-in-ci.lst
similarity index 100%
rename from docs/cover-in-ci.lst
rename to docs/source/cover-in-ci.lst
diff --git a/docs/design/llvm_sparse_runtime.md b/docs/source/design/llvm_sparse_runtime.md
similarity index 100%
rename from docs/design/llvm_sparse_runtime.md
rename to docs/source/design/llvm_sparse_runtime.md
diff --git a/docs/fragments/.gitkeep b/docs/source/fragments/.gitkeep
similarity index 100%
rename from docs/fragments/.gitkeep
rename to docs/source/fragments/.gitkeep
diff --git a/docs/lang/articles/about/_category_.json b/docs/source/lang/articles/about/_category_.json
similarity index 100%
rename from docs/lang/articles/about/_category_.json
rename to docs/source/lang/articles/about/_category_.json
diff --git a/docs/lang/articles/about/overview.md b/docs/source/lang/articles/about/overview.md
similarity index 100%
rename from docs/lang/articles/about/overview.md
rename to docs/source/lang/articles/about/overview.md
diff --git a/docs/lang/articles/advanced/_category_.json b/docs/source/lang/articles/advanced/_category_.json
similarity index 100%
rename from docs/lang/articles/advanced/_category_.json
rename to docs/source/lang/articles/advanced/_category_.json
diff --git a/docs/lang/articles/advanced/argument_pack.md b/docs/source/lang/articles/advanced/argument_pack.md
similarity index 100%
rename from docs/lang/articles/advanced/argument_pack.md
rename to docs/source/lang/articles/advanced/argument_pack.md
diff --git a/docs/lang/articles/advanced/data_oriented_class.md b/docs/source/lang/articles/advanced/data_oriented_class.md
similarity index 100%
rename from docs/lang/articles/advanced/data_oriented_class.md
rename to docs/source/lang/articles/advanced/data_oriented_class.md
diff --git a/docs/lang/articles/advanced/dataclass.md b/docs/source/lang/articles/advanced/dataclass.md
similarity index 100%
rename from docs/lang/articles/advanced/dataclass.md
rename to docs/source/lang/articles/advanced/dataclass.md
diff --git a/docs/lang/articles/advanced/meta.md b/docs/source/lang/articles/advanced/meta.md
similarity index 100%
rename from docs/lang/articles/advanced/meta.md
rename to docs/source/lang/articles/advanced/meta.md
diff --git a/docs/lang/articles/advanced/odop.md b/docs/source/lang/articles/advanced/odop.md
similarity index 100%
rename from docs/lang/articles/advanced/odop.md
rename to docs/source/lang/articles/advanced/odop.md
diff --git a/docs/lang/articles/advanced/quant.md b/docs/source/lang/articles/advanced/quant.md
similarity index 100%
rename from docs/lang/articles/advanced/quant.md
rename to docs/source/lang/articles/advanced/quant.md
diff --git a/docs/lang/articles/basic/_category_.json b/docs/source/lang/articles/basic/_category_.json
similarity index 100%
rename from docs/lang/articles/basic/_category_.json
rename to docs/source/lang/articles/basic/_category_.json
diff --git a/docs/lang/articles/basic/external.md b/docs/source/lang/articles/basic/external.md
similarity index 100%
rename from docs/lang/articles/basic/external.md
rename to docs/source/lang/articles/basic/external.md
diff --git a/docs/lang/articles/basic/field.md b/docs/source/lang/articles/basic/field.md
similarity index 100%
rename from docs/lang/articles/basic/field.md
rename to docs/source/lang/articles/basic/field.md
diff --git a/docs/lang/articles/basic/layout.md b/docs/source/lang/articles/basic/layout.md
similarity index 100%
rename from docs/lang/articles/basic/layout.md
rename to docs/source/lang/articles/basic/layout.md
diff --git a/docs/lang/articles/basic/ndarray.md b/docs/source/lang/articles/basic/ndarray.md
similarity index 100%
rename from docs/lang/articles/basic/ndarray.md
rename to docs/source/lang/articles/basic/ndarray.md
diff --git a/docs/lang/articles/basic/offset.md b/docs/source/lang/articles/basic/offset.md
similarity index 100%
rename from docs/lang/articles/basic/offset.md
rename to docs/source/lang/articles/basic/offset.md
diff --git a/docs/lang/articles/basic/sparse.md b/docs/source/lang/articles/basic/sparse.md
similarity index 100%
rename from docs/lang/articles/basic/sparse.md
rename to docs/source/lang/articles/basic/sparse.md
diff --git a/docs/lang/articles/c-api/_category_.json b/docs/source/lang/articles/c-api/_category_.json
similarity index 100%
rename from docs/lang/articles/c-api/_category_.json
rename to docs/source/lang/articles/c-api/_category_.json
diff --git a/docs/lang/articles/c-api/taichi_core.md b/docs/source/lang/articles/c-api/taichi_core.md
similarity index 100%
rename from docs/lang/articles/c-api/taichi_core.md
rename to docs/source/lang/articles/c-api/taichi_core.md
diff --git a/docs/lang/articles/c-api/taichi_vulkan.md b/docs/source/lang/articles/c-api/taichi_vulkan.md
similarity index 100%
rename from docs/lang/articles/c-api/taichi_vulkan.md
rename to docs/source/lang/articles/c-api/taichi_vulkan.md
diff --git a/docs/lang/articles/contribution/_category_.json b/docs/source/lang/articles/contribution/_category_.json
similarity index 100%
rename from docs/lang/articles/contribution/_category_.json
rename to docs/source/lang/articles/contribution/_category_.json
diff --git a/docs/lang/articles/contribution/contributor_guide.md b/docs/source/lang/articles/contribution/contributor_guide.md
similarity index 100%
rename from docs/lang/articles/contribution/contributor_guide.md
rename to docs/source/lang/articles/contribution/contributor_guide.md
diff --git a/docs/lang/articles/contribution/dev_install.md b/docs/source/lang/articles/contribution/dev_install.md
similarity index 100%
rename from docs/lang/articles/contribution/dev_install.md
rename to docs/source/lang/articles/contribution/dev_install.md
diff --git a/docs/lang/articles/contribution/developer_utilities.md b/docs/source/lang/articles/contribution/developer_utilities.md
similarity index 100%
rename from docs/lang/articles/contribution/developer_utilities.md
rename to docs/source/lang/articles/contribution/developer_utilities.md
diff --git a/docs/lang/articles/contribution/development_tips.md b/docs/source/lang/articles/contribution/development_tips.md
similarity index 100%
rename from docs/lang/articles/contribution/development_tips.md
rename to docs/source/lang/articles/contribution/development_tips.md
diff --git a/docs/lang/articles/contribution/doc_writing.md b/docs/source/lang/articles/contribution/doc_writing.md
similarity index 100%
rename from docs/lang/articles/contribution/doc_writing.md
rename to docs/source/lang/articles/contribution/doc_writing.md
diff --git a/docs/lang/articles/contribution/style_guide_en.md b/docs/source/lang/articles/contribution/style_guide_en.md
similarity index 100%
rename from docs/lang/articles/contribution/style_guide_en.md
rename to docs/source/lang/articles/contribution/style_guide_en.md
diff --git a/docs/lang/articles/contribution/windows_debug.md b/docs/source/lang/articles/contribution/windows_debug.md
similarity index 100%
rename from docs/lang/articles/contribution/windows_debug.md
rename to docs/source/lang/articles/contribution/windows_debug.md
diff --git a/docs/lang/articles/contribution/write_test.md b/docs/source/lang/articles/contribution/write_test.md
similarity index 100%
rename from docs/lang/articles/contribution/write_test.md
rename to docs/source/lang/articles/contribution/write_test.md
diff --git a/docs/lang/articles/contribution/writing_cpp_tests.md b/docs/source/lang/articles/contribution/writing_cpp_tests.md
similarity index 100%
rename from docs/lang/articles/contribution/writing_cpp_tests.md
rename to docs/source/lang/articles/contribution/writing_cpp_tests.md
diff --git a/docs/lang/articles/debug/_category_.json b/docs/source/lang/articles/debug/_category_.json
similarity index 100%
rename from docs/lang/articles/debug/_category_.json
rename to docs/source/lang/articles/debug/_category_.json
diff --git a/docs/lang/articles/debug/debugging.md b/docs/source/lang/articles/debug/debugging.md
similarity index 100%
rename from docs/lang/articles/debug/debugging.md
rename to docs/source/lang/articles/debug/debugging.md
diff --git a/docs/lang/articles/deployment/_category_.json b/docs/source/lang/articles/deployment/_category_.json
similarity index 100%
rename from docs/lang/articles/deployment/_category_.json
rename to docs/source/lang/articles/deployment/_category_.json
diff --git a/docs/lang/articles/deployment/tutorial.md b/docs/source/lang/articles/deployment/tutorial.md
similarity index 100%
rename from docs/lang/articles/deployment/tutorial.md
rename to docs/source/lang/articles/deployment/tutorial.md
diff --git a/docs/lang/articles/differentiable/_category_.json b/docs/source/lang/articles/differentiable/_category_.json
similarity index 100%
rename from docs/lang/articles/differentiable/_category_.json
rename to docs/source/lang/articles/differentiable/_category_.json
diff --git a/docs/lang/articles/differentiable/differentiable_programming.md b/docs/source/lang/articles/differentiable/differentiable_programming.md
similarity index 100%
rename from docs/lang/articles/differentiable/differentiable_programming.md
rename to docs/source/lang/articles/differentiable/differentiable_programming.md
diff --git a/docs/lang/articles/faqs/_category_.json b/docs/source/lang/articles/faqs/_category_.json
similarity index 100%
rename from docs/lang/articles/faqs/_category_.json
rename to docs/source/lang/articles/faqs/_category_.json
diff --git a/docs/lang/articles/faqs/faq.md b/docs/source/lang/articles/faqs/faq.md
old mode 100755
new mode 100644
similarity index 100%
rename from docs/lang/articles/faqs/faq.md
rename to docs/source/lang/articles/faqs/faq.md
diff --git a/docs/lang/articles/faqs/install.md b/docs/source/lang/articles/faqs/install.md
similarity index 100%
rename from docs/lang/articles/faqs/install.md
rename to docs/source/lang/articles/faqs/install.md
diff --git a/docs/lang/articles/get-started/_category_.json b/docs/source/lang/articles/get-started/_category_.json
similarity index 100%
rename from docs/lang/articles/get-started/_category_.json
rename to docs/source/lang/articles/get-started/_category_.json
diff --git a/docs/lang/articles/get-started/accelerate_python.md b/docs/source/lang/articles/get-started/accelerate_python.md
similarity index 100%
rename from docs/lang/articles/get-started/accelerate_python.md
rename to docs/source/lang/articles/get-started/accelerate_python.md
diff --git a/docs/lang/articles/get-started/accelerate_pytorch.md b/docs/source/lang/articles/get-started/accelerate_pytorch.md
similarity index 100%
rename from docs/lang/articles/get-started/accelerate_pytorch.md
rename to docs/source/lang/articles/get-started/accelerate_pytorch.md
diff --git a/docs/lang/articles/get-started/cloth_simulation.md b/docs/source/lang/articles/get-started/cloth_simulation.md
similarity index 100%
rename from docs/lang/articles/get-started/cloth_simulation.md
rename to docs/source/lang/articles/get-started/cloth_simulation.md
diff --git a/docs/lang/articles/get-started/hello_world.md b/docs/source/lang/articles/get-started/hello_world.md
similarity index 100%
rename from docs/lang/articles/get-started/hello_world.md
rename to docs/source/lang/articles/get-started/hello_world.md
diff --git a/docs/lang/articles/glossary/_category_.json b/docs/source/lang/articles/glossary/_category_.json
similarity index 100%
rename from docs/lang/articles/glossary/_category_.json
rename to docs/source/lang/articles/glossary/_category_.json
diff --git a/docs/lang/articles/glossary/glossary.md b/docs/source/lang/articles/glossary/glossary.md
similarity index 100%
rename from docs/lang/articles/glossary/glossary.md
rename to docs/source/lang/articles/glossary/glossary.md
diff --git a/docs/lang/articles/internals/_category_.json b/docs/source/lang/articles/internals/_category_.json
similarity index 100%
rename from docs/lang/articles/internals/_category_.json
rename to docs/source/lang/articles/internals/_category_.json
diff --git a/docs/lang/articles/internals/compilation.md b/docs/source/lang/articles/internals/compilation.md
similarity index 100%
rename from docs/lang/articles/internals/compilation.md
rename to docs/source/lang/articles/internals/compilation.md
diff --git a/docs/lang/articles/internals/internal.md b/docs/source/lang/articles/internals/internal.md
similarity index 100%
rename from docs/lang/articles/internals/internal.md
rename to docs/source/lang/articles/internals/internal.md
diff --git a/docs/lang/articles/internals/life_of_kernel_lowres.jpg b/docs/source/lang/articles/internals/life_of_kernel_lowres.jpg
similarity index 100%
rename from docs/lang/articles/internals/life_of_kernel_lowres.jpg
rename to docs/source/lang/articles/internals/life_of_kernel_lowres.jpg
diff --git a/docs/lang/articles/kernels/_category_.json b/docs/source/lang/articles/kernels/_category_.json
similarity index 100%
rename from docs/lang/articles/kernels/_category_.json
rename to docs/source/lang/articles/kernels/_category_.json
diff --git a/docs/lang/articles/kernels/kernel_function.md b/docs/source/lang/articles/kernels/kernel_function.md
similarity index 100%
rename from docs/lang/articles/kernels/kernel_function.md
rename to docs/source/lang/articles/kernels/kernel_function.md
diff --git a/docs/lang/articles/kernels/kernel_sync.md b/docs/source/lang/articles/kernels/kernel_sync.md
similarity index 100%
rename from docs/lang/articles/kernels/kernel_sync.md
rename to docs/source/lang/articles/kernels/kernel_sync.md
diff --git a/docs/lang/articles/math/_category_.json b/docs/source/lang/articles/math/_category_.json
similarity index 100%
rename from docs/lang/articles/math/_category_.json
rename to docs/source/lang/articles/math/_category_.json
diff --git a/docs/lang/articles/math/linear_solver.md b/docs/source/lang/articles/math/linear_solver.md
similarity index 100%
rename from docs/lang/articles/math/linear_solver.md
rename to docs/source/lang/articles/math/linear_solver.md
diff --git a/docs/lang/articles/math/math_module.md b/docs/source/lang/articles/math/math_module.md
similarity index 100%
rename from docs/lang/articles/math/math_module.md
rename to docs/source/lang/articles/math/math_module.md
diff --git a/docs/lang/articles/math/sparse_matrix.md b/docs/source/lang/articles/math/sparse_matrix.md
similarity index 100%
rename from docs/lang/articles/math/sparse_matrix.md
rename to docs/source/lang/articles/math/sparse_matrix.md
diff --git a/docs/lang/articles/performance_tuning/_category_.json b/docs/source/lang/articles/performance_tuning/_category_.json
similarity index 100%
rename from docs/lang/articles/performance_tuning/_category_.json
rename to docs/source/lang/articles/performance_tuning/_category_.json
diff --git a/docs/lang/articles/performance_tuning/performance.md b/docs/source/lang/articles/performance_tuning/performance.md
similarity index 100%
rename from docs/lang/articles/performance_tuning/performance.md
rename to docs/source/lang/articles/performance_tuning/performance.md
diff --git a/docs/lang/articles/performance_tuning/profiler.md b/docs/source/lang/articles/performance_tuning/profiler.md
similarity index 100%
rename from docs/lang/articles/performance_tuning/profiler.md
rename to docs/source/lang/articles/performance_tuning/profiler.md
diff --git a/docs/lang/articles/reference/_category_.json b/docs/source/lang/articles/reference/_category_.json
similarity index 100%
rename from docs/lang/articles/reference/_category_.json
rename to docs/source/lang/articles/reference/_category_.json
diff --git a/docs/lang/articles/reference/differences_between_taichi_and_python_programs.md b/docs/source/lang/articles/reference/differences_between_taichi_and_python_programs.md
similarity index 100%
rename from docs/lang/articles/reference/differences_between_taichi_and_python_programs.md
rename to docs/source/lang/articles/reference/differences_between_taichi_and_python_programs.md
diff --git a/docs/lang/articles/reference/global_settings.md b/docs/source/lang/articles/reference/global_settings.md
similarity index 100%
rename from docs/lang/articles/reference/global_settings.md
rename to docs/source/lang/articles/reference/global_settings.md
diff --git a/docs/lang/articles/reference/language_reference.md b/docs/source/lang/articles/reference/language_reference.md
similarity index 100%
rename from docs/lang/articles/reference/language_reference.md
rename to docs/source/lang/articles/reference/language_reference.md
diff --git a/docs/lang/articles/reference/operator.md b/docs/source/lang/articles/reference/operator.md
similarity index 100%
rename from docs/lang/articles/reference/operator.md
rename to docs/source/lang/articles/reference/operator.md
diff --git a/docs/lang/articles/reference/simt.md b/docs/source/lang/articles/reference/simt.md
similarity index 100%
rename from docs/lang/articles/reference/simt.md
rename to docs/source/lang/articles/reference/simt.md
diff --git a/docs/lang/articles/reference/syntax_sugars.md b/docs/source/lang/articles/reference/syntax_sugars.md
similarity index 100%
rename from docs/lang/articles/reference/syntax_sugars.md
rename to docs/source/lang/articles/reference/syntax_sugars.md
diff --git a/docs/lang/articles/static/assets/aot_tutorial.png b/docs/source/lang/articles/static/assets/aot_tutorial.png
similarity index 100%
rename from docs/lang/articles/static/assets/aot_tutorial.png
rename to docs/source/lang/articles/static/assets/aot_tutorial.png
diff --git a/docs/lang/articles/static/assets/arrow_field.png b/docs/source/lang/articles/static/assets/arrow_field.png
similarity index 100%
rename from docs/lang/articles/static/assets/arrow_field.png
rename to docs/source/lang/articles/static/assets/arrow_field.png
diff --git a/docs/lang/articles/static/assets/arrows.png b/docs/source/lang/articles/static/assets/arrows.png
similarity index 100%
rename from docs/lang/articles/static/assets/arrows.png
rename to docs/source/lang/articles/static/assets/arrows.png
diff --git a/docs/lang/articles/static/assets/bitpacked_fields_layout_example.png b/docs/source/lang/articles/static/assets/bitpacked_fields_layout_example.png
similarity index 100%
rename from docs/lang/articles/static/assets/bitpacked_fields_layout_example.png
rename to docs/source/lang/articles/static/assets/bitpacked_fields_layout_example.png
diff --git a/docs/lang/articles/static/assets/bls_indices_mapping.png b/docs/source/lang/articles/static/assets/bls_indices_mapping.png
similarity index 100%
rename from docs/lang/articles/static/assets/bls_indices_mapping.png
rename to docs/source/lang/articles/static/assets/bls_indices_mapping.png
diff --git a/docs/lang/articles/static/assets/circles.png b/docs/source/lang/articles/static/assets/circles.png
similarity index 100%
rename from docs/lang/articles/static/assets/circles.png
rename to docs/source/lang/articles/static/assets/circles.png
diff --git a/docs/lang/articles/static/assets/colored_circles.png b/docs/source/lang/articles/static/assets/colored_circles.png
similarity index 100%
rename from docs/lang/articles/static/assets/colored_circles.png
rename to docs/source/lang/articles/static/assets/colored_circles.png
diff --git a/docs/lang/articles/static/assets/effect_of_offline_cache.png b/docs/source/lang/articles/static/assets/effect_of_offline_cache.png
similarity index 100%
rename from docs/lang/articles/static/assets/effect_of_offline_cache.png
rename to docs/source/lang/articles/static/assets/effect_of_offline_cache.png
diff --git a/docs/lang/articles/static/assets/floating-point_formats.png b/docs/source/lang/articles/static/assets/floating-point_formats.png
similarity index 100%
rename from docs/lang/articles/static/assets/floating-point_formats.png
rename to docs/source/lang/articles/static/assets/floating-point_formats.png
diff --git a/docs/lang/articles/static/assets/fractal.png b/docs/source/lang/articles/static/assets/fractal.png
similarity index 100%
rename from docs/lang/articles/static/assets/fractal.png
rename to docs/source/lang/articles/static/assets/fractal.png
diff --git a/docs/lang/articles/static/assets/lines.png b/docs/source/lang/articles/static/assets/lines.png
similarity index 100%
rename from docs/lang/articles/static/assets/lines.png
rename to docs/source/lang/articles/static/assets/lines.png
diff --git a/docs/lang/articles/static/assets/point_field.png b/docs/source/lang/articles/static/assets/point_field.png
similarity index 100%
rename from docs/lang/articles/static/assets/point_field.png
rename to docs/source/lang/articles/static/assets/point_field.png
diff --git a/docs/lang/articles/static/assets/quant_array_layout_example.png b/docs/source/lang/articles/static/assets/quant_array_layout_example.png
similarity index 100%
rename from docs/lang/articles/static/assets/quant_array_layout_example.png
rename to docs/source/lang/articles/static/assets/quant_array_layout_example.png
diff --git a/docs/lang/articles/static/assets/rect.png b/docs/source/lang/articles/static/assets/rect.png
similarity index 100%
rename from docs/lang/articles/static/assets/rect.png
rename to docs/source/lang/articles/static/assets/rect.png
diff --git a/docs/lang/articles/static/assets/runtime.png b/docs/source/lang/articles/static/assets/runtime.png
similarity index 100%
rename from docs/lang/articles/static/assets/runtime.png
rename to docs/source/lang/articles/static/assets/runtime.png
diff --git a/docs/lang/articles/static/assets/triangles.png b/docs/source/lang/articles/static/assets/triangles.png
similarity index 100%
rename from docs/lang/articles/static/assets/triangles.png
rename to docs/source/lang/articles/static/assets/triangles.png
diff --git a/docs/lang/articles/type_system/_category_.json b/docs/source/lang/articles/type_system/_category_.json
similarity index 100%
rename from docs/lang/articles/type_system/_category_.json
rename to docs/source/lang/articles/type_system/_category_.json
diff --git a/docs/lang/articles/type_system/type.md b/docs/source/lang/articles/type_system/type.md
similarity index 100%
rename from docs/lang/articles/type_system/type.md
rename to docs/source/lang/articles/type_system/type.md
diff --git a/docs/lang/articles/utilities/_category_.json b/docs/source/lang/articles/utilities/_category_.json
similarity index 100%
rename from docs/lang/articles/utilities/_category_.json
rename to docs/source/lang/articles/utilities/_category_.json
diff --git a/docs/lang/articles/visualization/_category_.json b/docs/source/lang/articles/visualization/_category_.json
similarity index 100%
rename from docs/lang/articles/visualization/_category_.json
rename to docs/source/lang/articles/visualization/_category_.json
diff --git a/docs/lang/articles/visualization/export_results.md b/docs/source/lang/articles/visualization/export_results.md
similarity index 100%
rename from docs/lang/articles/visualization/export_results.md
rename to docs/source/lang/articles/visualization/export_results.md
diff --git a/docs/lang/articles/visualization/ggui.md b/docs/source/lang/articles/visualization/ggui.md
similarity index 100%
rename from docs/lang/articles/visualization/ggui.md
rename to docs/source/lang/articles/visualization/ggui.md
diff --git a/docs/lang/articles/visualization/gui_system.md b/docs/source/lang/articles/visualization/gui_system.md
similarity index 100%
rename from docs/lang/articles/visualization/gui_system.md
rename to docs/source/lang/articles/visualization/gui_system.md
diff --git a/docs/rfcs/20220410-rfc-process.md b/docs/source/rfcs/20220410-rfc-process.md
similarity index 100%
rename from docs/rfcs/20220410-rfc-process.md
rename to docs/source/rfcs/20220410-rfc-process.md
diff --git a/docs/rfcs/20220413-aot-for-all-snode.md b/docs/source/rfcs/20220413-aot-for-all-snode.md
similarity index 100%
rename from docs/rfcs/20220413-aot-for-all-snode.md
rename to docs/source/rfcs/20220413-aot-for-all-snode.md
diff --git a/docs/rfcs/res/.gitkeep b/docs/source/rfcs/res/.gitkeep
similarity index 100%
rename from docs/rfcs/res/.gitkeep
rename to docs/source/rfcs/res/.gitkeep
diff --git a/docs/rfcs/yyyymmdd-rfc-template.md b/docs/source/rfcs/yyyymmdd-rfc-template.md
similarity index 100%
rename from docs/rfcs/yyyymmdd-rfc-template.md
rename to docs/source/rfcs/yyyymmdd-rfc-template.md
diff --git a/docs/variable.json b/docs/source/variable.json
similarity index 100%
rename from docs/variable.json
rename to docs/source/variable.json
diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in
new file mode 100644
index 0000000000000..bf1dadd69b6b0
--- /dev/null
+++ b/docs/sphinx/_toc.yml.in
@@ -0,0 +1,29 @@
+defaults:
+  numbered: False
+  maxdepth: 6
+root: index
+subtrees:
+- entries:
+  - file: what-is-taichi.rst
+    title: What is Taichi Lang?
+
+- caption: Install
+  entries:
+  - file: install/taichi-install.rst
+    title: Install Taichi Lang
+
+- caption: Examples
+  entries:
+  - file: examples/taichi-get-started.rst 
+    title: Get started with Taichi Lang
+  - file: examples/taichi-examples.rst 
+    title: Run a Taichi Lang example
+
+- caption: About
+  entries:
+  - url: https://rocm.blogs.amd.com/artificial-intelligence/taichi_mi300x/README.html
+    title: ROCm Taichi Lang blog
+  - url: https://rocm.docs.amd.com/en/latest/contribute/contributing.html
+    title: Contribute to Taichi Lang
+  - file: about/license.rst
+    title: License
diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in
new file mode 100644
index 0000000000000..63017d30fae98
--- /dev/null
+++ b/docs/sphinx/requirements.in
@@ -0,0 +1 @@
+rocm-docs-core==1.30.1
diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt
new file mode 100644
index 0000000000000..eb5559591ac59
--- /dev/null
+++ b/docs/sphinx/requirements.txt
@@ -0,0 +1,276 @@
+#
+# This file is autogenerated by pip-compile with Python 3.12
+# by the following command:
+#
+#    pip-compile sphinx/requirements.in
+#
+accessible-pygments==0.0.5
+    # via pydata-sphinx-theme
+alabaster==1.0.0
+    # via sphinx
+asttokens==3.0.1
+    # via stack-data
+attrs==25.4.0
+    # via
+    #   jsonschema
+    #   jupyter-cache
+    #   referencing
+babel==2.17.0
+    # via
+    #   pydata-sphinx-theme
+    #   sphinx
+beautifulsoup4==4.14.2
+    # via pydata-sphinx-theme
+breathe==4.36.0
+    # via rocm-docs-core
+certifi==2025.11.12
+    # via requests
+cffi==2.0.0
+    # via
+    #   cryptography
+    #   pynacl
+charset-normalizer==3.4.4
+    # via requests
+click==8.3.1
+    # via
+    #   jupyter-cache
+    #   sphinx-external-toc
+colorama==0.4.6
+    # via
+    #   click
+    #   ipython
+    #   sphinx
+comm==0.2.3
+    # via ipykernel
+cryptography==46.0.3
+    # via pyjwt
+debugpy==1.8.17
+    # via ipykernel
+decorator==5.2.1
+    # via ipython
+docutils==0.21.2
+    # via
+    #   myst-parser
+    #   pydata-sphinx-theme
+    #   sphinx
+executing==2.2.1
+    # via stack-data
+fastjsonschema==2.21.2
+    # via
+    #   nbformat
+    #   rocm-docs-core
+gitdb==4.0.12
+    # via gitpython
+gitpython==3.1.45
+    # via rocm-docs-core
+greenlet==3.2.4
+    # via sqlalchemy
+idna==3.11
+    # via requests
+imagesize==1.4.1
+    # via sphinx
+importlib-metadata==8.7.0
+    # via
+    #   jupyter-cache
+    #   myst-nb
+ipykernel==7.1.0
+    # via myst-nb
+ipython==9.7.0
+    # via
+    #   ipykernel
+    #   myst-nb
+ipython-pygments-lexers==1.1.1
+    # via ipython
+jedi==0.19.2
+    # via ipython
+jinja2==3.1.6
+    # via
+    #   myst-parser
+    #   sphinx
+jsonschema==4.25.1
+    # via nbformat
+jsonschema-specifications==2025.9.1
+    # via jsonschema
+jupyter-cache==1.0.1
+    # via myst-nb
+jupyter-client==8.6.3
+    # via
+    #   ipykernel
+    #   nbclient
+jupyter-core==5.9.1
+    # via
+    #   ipykernel
+    #   jupyter-client
+    #   nbclient
+    #   nbformat
+markdown-it-py==3.0.0
+    # via
+    #   mdit-py-plugins
+    #   myst-parser
+markupsafe==3.0.3
+    # via jinja2
+matplotlib-inline==0.2.1
+    # via
+    #   ipykernel
+    #   ipython
+mdit-py-plugins==0.5.0
+    # via myst-parser
+mdurl==0.1.2
+    # via markdown-it-py
+myst-nb==1.3.0
+    # via rocm-docs-core
+myst-parser==4.0.1
+    # via myst-nb
+nbclient==0.10.2
+    # via
+    #   jupyter-cache
+    #   myst-nb
+nbformat==5.10.4
+    # via
+    #   jupyter-cache
+    #   myst-nb
+    #   nbclient
+nest-asyncio==1.6.0
+    # via ipykernel
+packaging==25.0
+    # via
+    #   ipykernel
+    #   sphinx
+parso==0.8.5
+    # via jedi
+platformdirs==4.5.0
+    # via jupyter-core
+prompt-toolkit==3.0.52
+    # via ipython
+psutil==7.1.3
+    # via ipykernel
+pure-eval==0.2.3
+    # via stack-data
+pycparser==2.23
+    # via cffi
+pydata-sphinx-theme==0.16.1
+    # via
+    #   rocm-docs-core
+    #   sphinx-book-theme
+pygithub==2.8.1
+    # via rocm-docs-core
+pygments==2.19.2
+    # via
+    #   accessible-pygments
+    #   ipython
+    #   ipython-pygments-lexers
+    #   pydata-sphinx-theme
+    #   sphinx
+pyjwt[crypto]==2.10.1
+    # via pygithub
+pynacl==1.6.1
+    # via pygithub
+python-dateutil==2.9.0.post0
+    # via jupyter-client
+pyyaml==6.0.3
+    # via
+    #   jupyter-cache
+    #   myst-nb
+    #   myst-parser
+    #   rocm-docs-core
+    #   sphinx-external-toc
+pyzmq==27.1.0
+    # via
+    #   ipykernel
+    #   jupyter-client
+referencing==0.37.0
+    # via
+    #   jsonschema
+    #   jsonschema-specifications
+requests==2.32.5
+    # via
+    #   pygithub
+    #   sphinx
+rocm-docs-core==1.30.1
+    # via -r sphinx/requirements.in
+roman-numerals-py==3.1.0
+    # via sphinx
+rpds-py==0.29.0
+    # via
+    #   jsonschema
+    #   referencing
+six==1.17.0
+    # via python-dateutil
+smmap==5.0.2
+    # via gitdb
+snowballstemmer==3.0.1
+    # via sphinx
+soupsieve==2.8
+    # via beautifulsoup4
+sphinx==8.2.3
+    # via
+    #   breathe
+    #   myst-nb
+    #   myst-parser
+    #   pydata-sphinx-theme
+    #   rocm-docs-core
+    #   sphinx-book-theme
+    #   sphinx-copybutton
+    #   sphinx-design
+    #   sphinx-external-toc
+    #   sphinx-notfound-page
+    #   sphinx-reredirects
+sphinx-book-theme==1.1.3
+    # via rocm-docs-core
+sphinx-copybutton==0.5.2
+    # via rocm-docs-core
+sphinx-design==0.6.1
+    # via rocm-docs-core
+sphinx-external-toc==1.0.1
+    # via rocm-docs-core
+sphinx-notfound-page==1.1.0
+    # via rocm-docs-core
+sphinx-reredirects==1.0.0
+    # via -r sphinx/requirements.in
+sphinxcontrib-applehelp==2.0.0
+    # via sphinx
+sphinxcontrib-devhelp==2.0.0
+    # via sphinx
+sphinxcontrib-htmlhelp==2.1.0
+    # via sphinx
+sphinxcontrib-jsmath==1.0.1
+    # via sphinx
+sphinxcontrib-qthelp==2.0.0
+    # via sphinx
+sphinxcontrib-serializinghtml==2.0.0
+    # via sphinx
+sqlalchemy==2.0.44
+    # via jupyter-cache
+stack-data==0.6.3
+    # via ipython
+tabulate==0.9.0
+    # via jupyter-cache
+tornado==6.5.2
+    # via
+    #   ipykernel
+    #   jupyter-client
+traitlets==5.14.3
+    # via
+    #   ipykernel
+    #   ipython
+    #   jupyter-client
+    #   jupyter-core
+    #   matplotlib-inline
+    #   nbclient
+    #   nbformat
+typing-extensions==4.15.0
+    # via
+    #   beautifulsoup4
+    #   myst-nb
+    #   pydata-sphinx-theme
+    #   pygithub
+    #   referencing
+    #   sqlalchemy
+urllib3==2.5.0
+    # via
+    #   pygithub
+    #   requests
+wcwidth==0.2.14
+    # via prompt-toolkit
+zipp==3.23.0
+    # via importlib-metadata
diff --git a/docs/what-is-taichi.rst b/docs/what-is-taichi.rst
new file mode 100644
index 0000000000000..7165ffe4d6f2a
--- /dev/null
+++ b/docs/what-is-taichi.rst
@@ -0,0 +1,55 @@
+.. meta::
+  :description: What is Taichi Lang?
+  :keywords: Taichi Lang, python, programming, JIT, AMD, ROCm, overview, introduction
+
+.. _what-is-taichi:
+
+********************************************************************
+What is Taichi Lang?
+********************************************************************
+
+Taichi Lang is an open-source, imperative, and parallel programming language embedded
+in Python, designed for high-performance numerical computation and real-time physical
+simulation. It uses just-in-time (JIT) compilation frameworks such as LLVM to accelerate
+compute-intensive Python code by compiling it into optimized GPU or CPU instructions.
+
+Features and use cases
+====================================================================
+
+Taichi Lang allows developers to write concise, high-level algorithms while leaving
+performance optimization to Taichi’s compiler. Taichi Lang is widely used in domains such
+as fluid dynamics, particle-based simulations, robotics, computer vision, augmented
+reality, artificial intelligence, and visual effects for film and gaming. For example,
+simulating a cloth falling onto a sphere, a system with tens of thousands of mass
+points and springs, can be implemented in Taichi Lang with only a few dozen lines of
+Python code due to its data-oriented design and automatic parallelization.
+
+Taichi Lang also supports advanced techniques such as hierarchical sparse voxel grids for
+large-scale simulations, enabling efficient handling of spatially sparse data structures
+in 3D visual computing. On ROCm, Taichi Lang is officially supported on AMD Instinct
+GPUs, making it a powerful tool for developers who need both flexibility and performance.
+
+
+Why Taichi Lang?
+====================================================================
+
+- **Built around Python**: Taichi Lang shares almost the same syntax with Python, 
+  which lets you write algorithms with minimal syntax differences. It is also well
+  integrated into the Python ecosystem, including NumPy and PyTorch.
+
+- **Flexibility**: Taichi Lang provides a set of generic data containers known 
+  as ``SNode``, an effective mechanism for composing hierarchical, 
+  multi-dimensional fields. This can cover many use patterns in numerical simulation
+  (for example, `spatially sparse computing <https://docs.taichi-lang.org/docs/sparse>`__).
+
+- **Performance**: With the ``ti.kernel`` decorator, Taichi Lang's JIT compiler 
+  automatically compiles your Python functions into efficient GPU or CPU machine
+  code for parallel execution.
+
+- **Portability**: Write your code once and run it everywhere. Taichi 
+  Lang supports most mainstream GPU APIs, such as NVIDIA CUDA and Vulkan. You can
+  write your code with Taichi Lang on ROCm and use it extensibly.
+
+- **Additional feature support**: Cross-platform support that includes a Vulkan-based 3D visualizer,
+  `differentiable programming <https://docs.taichi-lang.org/docs/differentiable_programming>`__,
+  `quantized computation <https://github.com/taichi-dev/quantaichi>`__ (experimental), and many more.
diff --git a/readthedocs.yaml b/readthedocs.yaml
new file mode 100644
index 0000000000000..3db285f6f772c
--- /dev/null
+++ b/readthedocs.yaml
@@ -0,0 +1,14 @@
+version: 2
+
+sphinx:
+  configuration: docs/conf.py
+
+python:
+  install:
+    - requirements: docs/sphinx/requirements.txt
+
+build:
+   os: ubuntu-24.04
+   tools:
+      python: "3.12"
+

From 36c0aa529d5c399f46619e617f553efa4f69b782 Mon Sep 17 00:00:00 2001
From: tmm77 <6461957+tmm77@users.noreply.github.com>
Date: Thu, 11 Dec 2025 13:49:25 -0500
Subject: [PATCH 39/42] removing rocm_setup_version

---
 CMakeLists.txt | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 54eadfb97b798..336eaf8882b28 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,9 +1,6 @@
 #*********************************************************************
 # The Taichi Programming Language
 #*********************************************************************
-
-rocm_setup_version(VERSION 1.8.0b2)
-
 cmake_minimum_required(VERSION 3.17)
 
 project(taichi)

From 7c446fbd31baa4f39741c6975902d34fc5d33d06 Mon Sep 17 00:00:00 2001
From: anisha-amd <anisha.sankar@amd.com>
Date: Mon, 9 Feb 2026 12:29:44 -0500
Subject: [PATCH 40/42] Update taichi-install.rst

---
 docs/install/taichi-install.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/install/taichi-install.rst b/docs/install/taichi-install.rst
index 9d26d7b203fc5..45f9c1cbe608d 100644
--- a/docs/install/taichi-install.rst
+++ b/docs/install/taichi-install.rst
@@ -112,7 +112,7 @@ This command will automatically download and install the appropriate ``.whl`` fi
 
 .. code-block:: bash
 
-   pip install amd-taichi==1.8.0b2 --extra-index-url=https://pypi.amd.com/simple
+   pip install amd-taichi==1.8.0b2 --index-url=https://pypi.amd.com/simple
    sudo apt-get update
    sudo apt-get install -y lld
 

From f47d1b87457843bf31f0793726b3c4c65e597ad1 Mon Sep 17 00:00:00 2001
From: Tiffany Mintz <tiffany.mintz@amd.com>
Date: Mon, 6 Apr 2026 22:15:53 +0000
Subject: [PATCH 41/42] removing pull_request.yml for security concerns

---
 .github/workflows/pull_request.yml | 30 ------------------------------
 1 file changed, 30 deletions(-)
 delete mode 100644 .github/workflows/pull_request.yml

diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml
deleted file mode 100644
index ed2da851c6ed3..0000000000000
--- a/.github/workflows/pull_request.yml
+++ /dev/null
@@ -1,30 +0,0 @@
-name: Presubmit Title Checks
-on:
-  pull_request_target:
-    types: [opened, synchronize, reopened, edited]
-
-jobs:
-  pre_submit:
-    name: Presubmit Title Checks
-    if: ${{ github.event.pull_request }}
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v2
-      - uses: actions/setup-python@v5
-        with:
-          python-version: 3.13
-      - name: Install Dependencies
-        run: pip install semver GitPython PyGithub
-
-      - name: Run PR Title Checker
-        run: |
-          python misc/ci_check_pr_title.py "$PR_TITLE"
-        env:
-          PR_TITLE: ${{ github.event.pull_request.title }}
-
-      # - name: PR Project Card Creation
-      #   if: github.event.action == 'opened' || github.event.action == 'edited'
-      #   run: python misc/ci_create_pr_card.py
-      #   env:
-      #     GITHUB_TOKEN: ${{ secrets.GARDENER_PAT }}
-      #     GH_EVENT: ${{ toJson(github.event) }}

From 440fcc2adc39b7e7f3fe1971abbd61abc1da1c80 Mon Sep 17 00:00:00 2001
From: tmm77 <6461957+tmm77@users.noreply.github.com>
Date: Mon, 4 May 2026 11:27:45 -0400
Subject: [PATCH 42/42] Delete ci/assets/mitm-ca.crt

This is to address AMD security concerns
---
 ci/assets/mitm-ca.crt | 25 -------------------------
 1 file changed, 25 deletions(-)
 delete mode 100644 ci/assets/mitm-ca.crt

diff --git a/ci/assets/mitm-ca.crt b/ci/assets/mitm-ca.crt
deleted file mode 100644
index 35ac8064ade73..0000000000000
--- a/ci/assets/mitm-ca.crt
+++ /dev/null
@@ -1,25 +0,0 @@
------BEGIN CERTIFICATE-----
-MIIESTCCAzGgAwIBAgIUGVTxoAKRFazNmE0SGx7FMbAIeVgwDQYJKoZIhvcNAQEL
-BQAwgbMxCzAJBgNVBAYTAkNOMRAwDgYDVQQIDAdCZWlqaW5nMRgwFgYDVQQKDA9U
-YWljaGkgR3JhcGhpY3MxGjAYBgNVBAsMEVRhaWNoaSBMYW5nIENJL0NEMTQwMgYD
-VQQDDCtUYWljaGkgQnVpbGRib3RzIE1JVE0gQ2VydGlmaWNhdGUgQXV0aG9yaXR5
-MSYwJAYJKoZIhvcNAQkBFhdiaW53YW5nQHRhaWNoaS5ncmFwaGljczAeFw0yMjEx
-MDkwODU2MDRaFw0zMjExMDYwODU2MDRaMIGzMQswCQYDVQQGEwJDTjEQMA4GA1UE
-CAwHQmVpamluZzEYMBYGA1UECgwPVGFpY2hpIEdyYXBoaWNzMRowGAYDVQQLDBFU
-YWljaGkgTGFuZyBDSS9DRDE0MDIGA1UEAwwrVGFpY2hpIEJ1aWxkYm90cyBNSVRN
-IENlcnRpZmljYXRlIEF1dGhvcml0eTEmMCQGCSqGSIb3DQEJARYXYmlud2FuZ0B0
-YWljaGkuZ3JhcGhpY3MwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQCo
-l0vo88tMrh5pOe7YUuGxqP0CE78qaESvHNt3L8fG4C9skHzI9uBQDkKyp8vTi6hu
-luTsPJ2wF59ok8q/RCu0fDT3k1myuBRVsxg7VB7ekubdsfxgyJIqd5c2GByf3+rD
-S/emPNZGCZ6VOrBapJadbPawiCO+NBMQZkFAMK2rDfPii/fxmssZ5DF75PHE6BON
-oHOUv7gz6FDOQn1w6VacSn1xNdWtQRo974HtIJs8ODoi4e4Gt1HkS0GxTQ0sLQad
-wxcGTaIJafrYizhTO3xCo/57o7ckS3Hd4rX21C1atLnlhPAPJ93MbVwMSWJfGPIA
-RH92Wj6psI7ryXsgD6XrAgMBAAGjUzBRMB0GA1UdDgQWBBQompdQJjwjq49IJPOi
-Gwfh3g5mizAfBgNVHSMEGDAWgBQompdQJjwjq49IJPOiGwfh3g5mizAPBgNVHRMB
-Af8EBTADAQH/MA0GCSqGSIb3DQEBCwUAA4IBAQARYA5Ul/WITZf3m2fxp4yNrs3p
-zBpQdqS8O8Y/0U3kvNBSRvXUzThHnzUSC47M1lpXpDS7OVBA3gmpWNG9hsi1BVlh
-q44amtZWNWMzPI7KtxC/BcxMER06iBv/3NdIL2gvAW8+mrGbT5UxjiKRFqgSpCeN
-A6/NxlpG2tiMRsRx7l7OnybPvmxP9/hx+3OFq0T2gHbOtdmvggIp4Oe4O8fQUGOj
-x0Ug2nEqzY/XYp8uj4NUge/3HGDFgOkGK+hRkVDVWOY3GE3yGJx+46ifDRChYGuU
-rugHTLTg1YmbYAH2HtwwvPSPWXn/b50oSNXeiiW/Y88GLXKK7TWlVbVQKNAT
------END CERTIFICATE-----