diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml
deleted file mode 100644
index ed2da851c6ed3..0000000000000
--- a/.github/workflows/pull_request.yml
+++ /dev/null
@@ -1,30 +0,0 @@
-name: Presubmit Title Checks
-on:
-  pull_request_target:
-    types: [opened, synchronize, reopened, edited]
-
-jobs:
-  pre_submit:
-    name: Presubmit Title Checks
-    if: ${{ github.event.pull_request }}
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v2
-      - uses: actions/setup-python@v5
-        with:
-          python-version: 3.13
-      - name: Install Dependencies
-        run: pip install semver GitPython PyGithub
-
-      - name: Run PR Title Checker
-        run: |
-          python misc/ci_check_pr_title.py "$PR_TITLE"
-        env:
-          PR_TITLE: ${{ github.event.pull_request.title }}
-
-      # - name: PR Project Card Creation
-      #   if: github.event.action == 'opened' || github.event.action == 'edited'
-      #   run: python misc/ci_create_pr_card.py
-      #   env:
-      #     GITHUB_TOKEN: ${{ secrets.GARDENER_PAT }}
-      #     GH_EVENT: ${{ toJson(github.event) }}
diff --git a/.github/workflows/scripts/ti_build/compiler.py b/.github/workflows/scripts/ti_build/compiler.py
index b09ddea9da3c3..bba1983469636 100644
--- a/.github/workflows/scripts/ti_build/compiler.py
+++ b/.github/workflows/scripts/ti_build/compiler.py
@@ -33,7 +33,7 @@ def setup_clang(as_compiler=True) -> None:
     """
     u = platform.uname()
     if u.system == "Linux":
-        for v in ("", "-14", "-13", "-12", "-11", "-10"):
+        for v in ("", "-20", "-19", "-18", "-17", "-16", "-15", "-14", "-13", "-12", "-11", "-10"):
             clang = shutil.which(f"clang{v}")
             if clang is not None:
                 clangpp = shutil.which(f"clang++{v}")
diff --git a/.github/workflows/scripts/ti_build/entry.py b/.github/workflows/scripts/ti_build/entry.py
index 5c7613e65822b..570dc995dee13 100644
--- a/.github/workflows/scripts/ti_build/entry.py
+++ b/.github/workflows/scripts/ti_build/entry.py
@@ -82,8 +82,8 @@ def setup_basic_build_env():
         setup_clang()
 
     setup_llvm()
-    if u.system in ("Linux", "Windows"):
-        # We support & test Vulkan shader debug printf on Linux && Windows
+    if u.system == "Linux":
+        # We support & test Vulkan shader debug printf on Linux
         # This is done through the validation layer
         from .vulkan import setup_vulkan
 
diff --git a/.github/workflows/scripts/ti_build/llvm.py b/.github/workflows/scripts/ti_build/llvm.py
index e487a560b4a03..43376cd3875a6 100644
--- a/.github/workflows/scripts/ti_build/llvm.py
+++ b/.github/workflows/scripts/ti_build/llvm.py
@@ -1,3 +1,4 @@
+
 # -*- coding: utf-8 -*-
 
 # -- stdlib --
@@ -19,18 +20,25 @@ def setup_llvm() -> None:
     Download and install LLVM.
     """
     u = platform.uname()
-    if u.system == "Linux":
+    if (u.system, u.machine) == ("Linux", "x86_64"):
         if cmake_args.get_effective("TI_WITH_AMDGPU"):
-            out = get_cache_home() / "llvm15-amdgpu-005"
-            url = "https://github.com/GaleSeLee/assets/releases/download/v0.0.5/taichi-llvm-15.0.0-linux.zip"
+            # We should use LLVM toolchains shipped with OS.
+            os.environ["LLVM_DIR"] = os.environ["LLVM_PATH"]+"/lib/cmake"
+            os.environ["CPATH"] = os.environ["ROCM_PATH"]+"/include"
         elif is_manylinux2014():
             # FIXME: prebuilt llvm15 on ubuntu didn't work on manylinux2014 image of centos. Once that's fixed, remove this hack.
             out = get_cache_home() / "llvm15-manylinux2014"
             url = "https://github.com/ailzhang/torchhub_example/releases/download/0.3/taichi-llvm-15-linux.zip"
+            download_dep(url, out, strip=1)
         else:
             out = get_cache_home() / "llvm15"
             url = "https://github.com/taichi-dev/taichi_assets/releases/download/llvm15/taichi-llvm-15-linux.zip"
-        download_dep(url, out, strip=1)
+            download_dep(url, out, strip=1)
+
+    elif (u.system, u.machine) in (("Linux", "arm64"), ("Linux", "aarch64")):
+        out = get_cache_home() / "llvm15-manylinux2014"
+        # FIXME: ARM LLVM!
+        pass
     elif (u.system, u.machine) == ("Darwin", "arm64"):
         out = get_cache_home() / "llvm15-m1-nozstd"
         url = "https://github.com/taichi-dev/taichi_assets/releases/download/llvm15/taichi-llvm-15-m1-nozstd.zip"
@@ -47,5 +55,8 @@ def setup_llvm() -> None:
         raise RuntimeError(f"Unsupported platform: {u.system} {u.machine}")
 
     # We should use LLVM toolchains shipped with OS.
-    # path_prepend('PATH', out / 'bin')
-    os.environ["LLVM_DIR"] = str(out)
+     #path_prepend('PATH', out / 'bin')
+    if ((u.system, u.machine) not in (("Linux", "arm64"), ("Linux", "aarch64"))) and not (cmake_args.get_effective("TI_WITH_AMDGPU")):
+        os.environ["LLVM_DIR"] = "/usr/lib/llvm-20/cmake"
+        os.environ["CUDA_HOME"] = "/usr/local/cuda"
+        os.environ["CPATH"] = "/usr/local/cuda/include"
diff --git a/.wordlist.txt b/.wordlist.txt
new file mode 100644
index 0000000000000..c039ac3b71a1a
--- /dev/null
+++ b/.wordlist.txt
@@ -0,0 +1,865 @@
+AAC
+ABI
+ACE
+ACEs
+ACS
+AccVGPR
+AccVGPRs
+ALU
+AMD
+AMDGPU
+AMDGPUs
+AMDMIGraphX
+AMI
+AOCC
+AOMP
+AOTriton
+APBDIS
+APIC
+APIs
+APU
+ASIC
+ASICs
+ASan
+ASAN
+ASm
+ATI
+AddressSanitizer
+AlexNet
+Andrej
+Arb
+Autocast
+BARs
+BLAS
+BMC
+Blit
+Blockwise
+Bluefield
+Bootloader
+CCD
+CDNA
+CHTML
+CIFAR
+CLI
+CLion
+CMake
+CMakeLists
+CMakePackage
+CP
+CPC
+CPF
+CPP
+CPU
+CPUs
+Cron
+CSC
+CSE
+CSV
+CSn
+CTest
+CTests
+CU
+CUDA
+CUs
+CXX
+Cavium
+CentOS
+ChatGPT
+CoRR
+Codespaces
+Commitizen
+CommonMark
+Concretized
+Conda
+ConnectX
+CuPy
+Dashboarding
+DDR
+DF
+DGEMM
+DIMM
+DKMS
+DL
+DMA
+DNN
+DNNL
+DPM
+DRI
+DW
+DWORD
+Dask
+DataFrame
+DataLoader
+DataParallel
+Debian
+DeepSpeed
+Dependabot
+Deprecations
+DevCap
+Dockerfile
+Doxygen
+ELMo
+ENDPGM
+EPYC
+ESXi
+EoS
+FBGEMM
+FFT
+FFTs
+FFmpeg
+FHS
+FIXME
+FMA
+FP
+FX
+Filesystem
+FindDb
+Flang
+Fortran
+Fuyu
+GALB
+GCC
+GCD
+GCDs
+GCN
+GDB
+GDDR
+GDR
+GDS
+GEMM
+GEMMs
+GFortran
+GiB
+GIM
+GL
+GLXT
+Gloo
+GMI
+GPG
+GPR
+GPT
+GPU
+GPU's
+GPUs
+GRBM
+GenAI
+GenZ
+GitHub
+Gitpod
+HBM
+HCA
+HGX
+HIPCC
+HIPExtension
+HIPIFY
+HIPification
+HIPify
+HPC
+HPCG
+HPE
+HPL
+HSA
+HW
+HWE
+HWS
+Haswell
+Higgs
+Hyperparameters
+Huggingface
+ICD
+ICV
+IDE
+IDEs
+IFWI
+IMDb
+IOMMU
+IOP
+IOPM
+IOV
+IRQ
+ISA
+ISV
+ISVs
+ITL
+ImageNet
+InfiniBand
+Inlines
+IntelliSense
+Interop
+Intersphinx
+Intra
+Ioffe
+JAX's
+Jinja
+JSON
+Jupyter
+KFD
+KFDTest
+KMD
+KV
+KVM
+Karpathy's
+KiB
+Keras
+Khronos
+LAPACK
+LCLK
+LDS
+LLM
+LLMs
+LLVM
+LM
+LSAN
+LSan
+LTS
+LoRA
+MEM
+MERCHANTABILITY
+MFMA
+MiB
+MIGraphX
+MIOpen
+MIOpenGEMM
+MIOpen's
+MIVisionX
+MLM
+MMA
+MMIO
+MMIOH
+MMU
+MNIST
+MPI
+MSVC
+MVAPICH
+MVFFR
+Makefile
+Makefiles
+Matplotlib
+Matrox
+Megatrends
+Megatron
+Mellanox
+Mellanox's
+Meta's
+Miniconda
+MirroredStrategy
+Mixtral
+Multicore
+Multithreaded
+MyEnvironment
+MyST
+NBIO
+NBIOs
+NCCL
+NCF
+NIC
+NICs
+NLI
+NLP
+NPKit
+NPS
+NSP
+NUMA
+NVCC
+NVIDIA
+NVPTX
+NaN
+Nano
+Navi
+Noncoherently
+NousResearch's
+NumPy
+OAM
+OAMs
+OCP
+OEM
+OFED
+OMM
+OMP
+OMPI
+OMPT
+OMPX
+ONNX
+OSS
+OSU
+OpenCL
+OpenCV
+OpenFabrics
+OpenGL
+OpenMP
+OpenMPI
+OpenSSL
+OpenVX
+OpenXLA
+Oversubscription
+PagedAttention
+Pallas
+PCC
+PCI
+PCIe
+PEFT
+PEQT
+PIL
+PILImage
+POR
+PRNG
+PRs
+PaLM
+Pageable
+PeerDirect
+PerfDb
+Perfetto
+PipelineParallel
+PnP
+PowerEdge
+PowerShell
+Profiler's
+PyPi
+Pytest
+PyTorch
+Qcycles
+Qwen
+RAII
+RAS
+RCCL
+RDC
+RDC's
+RDMA
+RDNA
+README
+RHEL
+RMW
+RNN
+RNNs
+ROC
+ROCProfiler
+ROCT
+ROCTracer
+ROCclr
+ROCdbgapi
+ROCgdb
+ROCk
+ROCm
+ROCmCC
+ROCmSoftwarePlatform
+ROCmValidationSuite
+ROCprofiler
+ROCr
+RPP
+RST
+RW
+Radeon
+RelWithDebInfo
+Req
+Rickle
+RoCE
+Runfile
+Ryzen
+SALU
+SBIOS
+SCA
+SDK
+SDMA
+SDPA
+SDRAM
+SENDMSG
+SGPR
+SGPRs
+SHA
+SHARK's
+SIGQUIT
+SIMD
+SIMDs
+SKU
+SKUs
+SLES
+SMEM
+SMI
+SMT
+SPI
+SQs
+SRAM
+SRAMECC
+SVD
+SWE
+SerDes
+ShareGPT
+Shlens
+Skylake
+Softmax
+Spack
+SplitK
+Supermicro
+Szegedy
+TCA
+TCC
+TCI
+TCIU
+TCP
+TCR
+TF
+TFLOPS
+TP
+TPS
+TPU
+TPUs
+TSME
+Tagram
+TensileLite
+TensorBoard
+TensorFlow
+TensorParallel
+ToC
+TorchAudio
+torchaudio
+TorchElastic
+TorchMIGraphX
+torchrec
+TorchScript
+TorchServe
+torchserve
+torchtext
+TorchVision
+TransferBench
+TrapStatus
+UAC
+UC
+UCC
+UCX
+UE
+UIF
+UMC
+USM
+UTCL
+UTIL
+Uncached
+Unittests
+Unhandled
+VALU
+VBIOS
+VGPR
+VGPRs
+VM
+VMEM
+VMWare
+VRAM
+VSIX
+VSkipped
+Vanhoucke
+Vulkan
+WGP
+WGPs
+WX
+WikiText
+Wojna
+Workgroups
+Writebacks
+XCD
+XCDs
+XGBoost
+XGBoost's
+XGMI
+XT
+XTX
+Xeon
+Xilinx
+Xnack
+Xteam
+YAML
+YML
+YModel
+ZeRO
+ZenDNN
+accuracies
+activations
+addr
+alloc
+allocatable
+allocator
+allocators
+amdgpu
+api
+atmi
+atomics
+autogenerated
+autotune
+avx
+awk
+backend
+backends
+benchmarked
+benchmarking
+bfloat
+bilinear
+bitcode
+bitsandbytes
+blit
+bootloader
+boson
+bosons
+br
+buildable
+bursty
+bzip
+cacheable
+cd
+centos
+centric
+changelog
+chiplet
+cmake
+cmd
+coalescable
+codename
+collater
+comgr
+completers
+composable
+concretization
+config
+conformant
+constructible
+convolutional
+convolves
+copyable
+cpp
+csn
+cuBLAS
+cuda
+cuDNN
+cudnn
+cuFFT
+cuLIB
+cuRAND
+cuSOLVER
+cuSPARSE
+customizations
+cTDP
+dataset
+datasets
+dataspace
+datatype
+datatypes
+dbgapi
+de
+deallocation
+debuggability
+debian
+denoise
+denoised
+denoises
+denormalize
+dequantization
+dequantizes
+deserializers
+detections
+dev
+devicelibs
+devsel
+dimensionality
+disambiguates
+distro
+distros
+dkms
+dtype
+el
+embeddings
+enablement
+encodings
+endpgm
+enqueue
+env
+epilog
+etcetera
+ethernet
+exascale
+executables
+ffmpeg
+filesystem
+fortran
+fp
+gRPC
+galb
+gcc
+gdb
+gfortran
+gfx
+githooks
+github
+globals
+gnupg
+grayscale
+gzip
+heterogenous
+hipBLAS
+hipBLASLt
+hipBLASLt's
+hipblaslt
+hipCUB
+hipFFT
+hipFORT
+hipLIB
+hipRAND
+hipSOLVER
+hipSPARSE
+hipSPARSELt
+hipTensor
+hipamd
+hipblas
+hipcub
+hipfft
+hipfort
+hipify
+hipsolver
+hipsparse
+hlist
+hotspotting
+hpc
+hpp
+hsa
+hsakmt
+hyperparameter
+hyperparameters
+iDRAC
+ib_core
+inband
+incrementing
+inductor
+inferencing
+inflight
+init
+initializer
+inlining
+installable
+interop
+interprocedural
+intra
+invariants
+invocating
+ipo
+jax
+kdb
+kfd
+kv
+lang
+latencies
+len
+libfabric
+libjpeg
+libs
+linearized
+linter
+linux
+llvm
+localscratch
+logits
+lossy
+macOS
+matchers
+microarchitecture
+migraphx
+miopen
+miopengemm
+mivisionx
+mjx
+mkdir
+mlirmiopen
+mtypes
+mutex
+mvffr
+namespace
+namespaces
+nanoGPT
+num
+numref
+ocl
+opencl
+opencv
+openmp
+openssl
+optimizers
+os
+oversubscription
+pageable
+pallas
+parallelization
+parallelizing
+parameterization
+passthrough
+perfcounter
+performant
+perl
+pragma
+pre
+prebuild
+prebuilt
+precompiled
+preconditioner
+preconfigured
+preemptible
+prefetch
+prefetchable
+prefill
+prefills
+preloaded
+preprocess
+preprocessed
+preprocessing
+preprocessor
+prequantized
+prerequisites
+profiler
+profilers
+protobuf
+pseudorandom
+py
+recommender
+recommenders
+quantile
+quantizer
+quasirandom
+queueing
+rccl
+rdc
+rdma
+reStructuredText
+redirections
+refactorization
+reformats
+repo
+repos
+representativeness
+req
+resampling
+rescaling
+reusability
+roadmap
+roc
+rocAL
+rocALUTION
+rocBLAS
+rocDecode
+rocFFT
+rocHPCG
+rocJPEG
+rocLIB
+rocMLIR
+rocPRIM
+rocPyDecode
+rocRAND
+rocSOLVER
+rocSPARSE
+rocThrust
+rocWMMA
+rocalution
+rocblas
+rocclr
+rocfft
+rocm
+rocminfo
+rocprim
+rocprof
+rocprofiler
+rocr
+rocrand
+rocsolver
+rocsparse
+rocthrust
+roctracer
+rst
+runtime
+runtimes
+sL
+scalability
+scalable
+scipy
+seealso
+sendmsg
+seqs
+serializers
+shader
+sharding
+sigmoid
+sm
+smi
+softmax
+spack
+src
+stochastically
+strided
+subcommand
+subdirectory
+subexpression
+subfolder
+subfolders
+submodule
+submodules
+supercomputing
+symlink
+symlinks
+sys
+td
+tensorfloat
+th
+tokenization
+tokenize
+tokenized
+tokenizer
+tokenizes
+toolchain
+toolchains
+toolset
+toolsets
+torchvision
+tqdm
+tracebacks
+txt
+uarch
+uncached
+uncorrectable
+underoptimized
+unhandled
+uninstallation
+unmapped
+unsqueeze
+unstacking
+unswitching
+untrusted
+untuned
+upvote
+USM
+UTCL
+UTIL
+utils
+vL
+variational
+vdi
+vectorizable
+vectorization
+vectorize
+vectorized
+vectorizer
+vectorizes
+virtualize
+virtualized
+vjxb
+voxel
+walkthrough
+walkthroughs
+watchpoints
+wavefront
+wavefronts
+whitespace
+whitespaces
+workgroup
+workgroups
+writeback
+writebacks
+wrreq
+wzo
+xargs
+xGMI
+xz
+yaml
+ysvmadyb
+zypper
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 086e831fcb95e..336eaf8882b28 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,6 @@
 #*********************************************************************
 # The Taichi Programming Language
 #*********************************************************************
-
 cmake_minimum_required(VERSION 3.17)
 
 project(taichi)
@@ -156,7 +155,7 @@ if (TI_WITH_CUDA)
 endif()
 
 if (TI_WITH_AMDGPU)
-    set(AMDGPU_ARCH "amdgpu")
+	set(AMDGPU_ARCH "amdgpu")
 endif()
 
 if (TI_WITH_DX12)
@@ -171,7 +170,7 @@ if (TI_WITH_LLVM)
       set (CLANG_EXECUTABLE ${CMAKE_CXX_COMPILER})
       message("Clang executable using host compiler ${CLANG_EXECUTABLE}")
     else()
-      find_program(CLANG_EXECUTABLE NAMES clang-15 clang-14 clang-13 clang-12 clang-11 clang-10 clang-9 clang-8 clang-7 clang)
+        find_program(CLANG_EXECUTABLE NAMES clang-20 clang-19 clang-18 clang-17 clang-16 clang-15 clang-14 clang-13 clang-12 clang-11 clang-10 clang-9 clang-8 clang-7 clang)
       message("Clang executable found at ${CLANG_EXECUTABLE}")
     endif()
 
@@ -193,7 +192,7 @@ if (TI_WITH_LLVM)
     endif()
 
     # Highest clang version that we've tested
-    set(CLANG_HIGHEST_VERSION "15")
+    set(CLANG_HIGHEST_VERSION "20")
 
     check_clang_version()
 
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
new file mode 100644
index 0000000000000..62ca57835f091
--- /dev/null
+++ b/Dockerfile.rocm
@@ -0,0 +1,116 @@
+# ---------------------------------------------
+# Stage 1: Build Taichi and generate artifacts
+# ---------------------------------------------
+ARG BASE_IMAGE=rocm/dev-ubuntu-24.04:7.0-complete
+FROM ${BASE_IMAGE} as taichi_build
+
+ARG LLVM_VERSION=20
+ARG GPU_TARGETS=gfx90a,gfx940,gfx941,gfx942,gfx950
+ARG ROCM_VERSION=7.0.0
+ARG PROJECT_NAME
+ARG RELEASE_VERSION
+ARG TI_VERSION_MAJOR
+ARG TI_VERSION_MINOR
+ARG TI_VERSION_PATCH
+
+ENV PROJECT_NAME=${PROJECT_NAME}
+ENV RELEASE_VERSION=${RELEASE_VERSION}
+ENV TI_VERSION_MAJOR=${TI_VERSION_MAJOR}
+ENV TI_VERSION_MINOR=${TI_VERSION_MINOR}
+ENV TI_VERSION_PATCH=${TI_VERSION_PATCH}
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV LLVM_PATH=/usr/lib/llvm-${LLVM_VERSION}
+ENV PATH=${LLVM_PATH}/bin:$PATH
+ENV ROCM_PATH=/opt/rocm-${ROCM_VERSION}
+
+ENV TAICHI_CMAKE_ARGS="-DTI_WITH_VULKAN=OFF -DTI_WITH_OPENGL=OFF -DTI_BUILD_TESTS=OFF -DTI_BUILD_EXAMPLES=OFF -DCMAKE_PREFIX_PATH=${LLVM_PATH}/lib/cmake -DCMAKE_CXX_COMPILER=${LLVM_PATH}/bin/clang++ -DTI_WITH_AMDGPU=ON -DTI_WITH_CUDA=OFF -DTI_AMDGPU_ARCHS=${GPU_TARGETS} -DUSE_LLD=ON -DTI_WITH_LLVM=ON"
+
+# Install build dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+	git wget vim git freeglut3-dev libglfw3-dev libglm-dev \
+	libglu1-mesa-dev libjpeg-dev liblz4-dev libpng-dev \
+	libssl-dev libwayland-dev libx11-xcb-dev libxcb-dri3-dev \
+	libxcb-ewmh-dev libxcb-keysyms1-dev libxcb-randr0-dev \
+	libxcursor-dev libxi-dev libxinerama-dev libxrandr-dev \
+	libzstd-dev python3-pip cmake pybind11-dev \
+	ca-certificates python3-venv rocm-llvm-dev \
+    gdb python3-dbg 
+
+WORKDIR /app
+
+RUN wget https://apt.llvm.org/llvm.sh \
+	&& chmod +x llvm.sh \
+	&& apt-get update && apt-get install -y \
+	lsb-release software-properties-common gnupg \
+	&& ./llvm.sh ${LLVM_VERSION} llvm clang lld 
+
+# Copy source code
+COPY . .
+# RUN git config --global --add safe.directory "${TAICHI_SRC}"
+
+# Install Python build dependencies
+RUN python3 -m pip config set global.break-system-packages true && \
+    python3 -m pip install -r requirements_dev.txt
+
+# Build Taichi and generate artifacts
+RUN cd external/spdlog && \
+    git apply /app/spdlog_fmt.patch && \
+    cd /app && \
+    rm -rf /root/.cache/* && \
+    ./build.py && \
+    echo "=== Listing dist directory ===" && \
+    ls -la dist/ && \
+    echo "=== Creating artifacts directory ===" && \
+    mkdir -p /tmp/artifacts && \
+    echo "=== Copying wheel files ===" && \
+    cp dist/*.whl /tmp/artifacts/ && \
+    echo "=== Listing artifacts directory ===" && \
+    ls -la /tmp/artifacts/ && \
+    tar -czvf /tmp/artifacts/taichi-tests.tar.gz -C /app --transform 's,^,taichi/,' --exclude='external' --exclude='_skbuild' .
+
+# ---------------------------------------------
+# Stage 2: Create runtime image with Taichi installed
+# ---------------------------------------------
+FROM ${BASE_IMAGE} as taichi_final
+
+ARG LLVM_VERSION=20
+ARG ROCM_VERSION=7.0.0
+ENV DEBIAN_FRONTEND=noninteractive
+ENV LLVM_PATH=/usr/lib/llvm-${LLVM_VERSION}
+ENV ROCM_PATH=/opt/rocm-${ROCM_VERSION}
+ENV PATH=${LLVM_PATH}/bin:$PATH
+
+# Install runtime dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+	git wget vim git freeglut3-dev libglfw3-dev libglm-dev \
+	libglu1-mesa-dev libjpeg-dev liblz4-dev libpng-dev \
+	libssl-dev libwayland-dev libx11-xcb-dev libxcb-dri3-dev \
+	libxcb-ewmh-dev libxcb-keysyms1-dev libxcb-randr0-dev \
+	libxcursor-dev libxi-dev libxinerama-dev libxrandr-dev \
+	libzstd-dev python3-pip cmake pybind11-dev \
+	ca-certificates python3-venv rocm-llvm-dev \
+    gdb python3-dbg 
+
+WORKDIR /app
+
+RUN wget https://apt.llvm.org/llvm.sh \
+	&& chmod +x llvm.sh \
+	&& apt-get update && apt-get install -y \
+	lsb-release software-properties-common gnupg \
+	&& ./llvm.sh ${LLVM_VERSION} llvm clang lld \
+    && rm -rf llvm.sh
+
+# Copy and install Taichi wheel
+COPY --from=taichi_build /tmp/artifacts/*.whl /app/
+
+RUN python3 -m pip config set global.break-system-packages true \
+    && ls -la /app/ \
+    && for whl in /app/*.whl; do python3 -m pip install "$whl"; done \
+    && rm -rf /app/*.whl
+
+# ---------------------------------------------
+# Stage 3: Export raw artifacts to host
+# ---------------------------------------------
+FROM scratch as taichi_export
+COPY --from=taichi_build /tmp/artifacts/ .
diff --git a/README.md b/README.md
index b7f06d3c48151..6e44f1d45a7d7 100644
--- a/README.md
+++ b/README.md
@@ -38,7 +38,7 @@ The language has broad applications spanning real-time physical simulation, nume
 - Flexibility: Taichi Lang provides a set of generic data containers known as *SNode* (/ˈsnoʊd/), an effective mechanism for composing hierarchical, multi-dimensional fields. This can cover many use patterns in numerical simulation (e.g. [spatially sparse computing](https://docs.taichi-lang.org/docs/sparse)).
 - Performance: With the `@ti.kernel` decorator, Taichi Lang's JIT compiler automatically compiles your Python functions into efficient GPU or CPU machine code for parallel execution.
 - Portability: Write your code once and run it everywhere. Currently, Taichi Lang supports most mainstream GPU APIs, such as CUDA and Vulkan.
-- ... and many more features! A cross-platform, Vulkan-based 3D visualizer, [differentiable programming](https://docs.taichi-lang.org/docs/differentiable_programming),  [quantized computation](https://github.com/taichi-dev/quantaichi) (experimental), etc.
+- Additional feature support: A cross-platform, Vulkan-based 3D visualizer, [differentiable programming](https://docs.taichi-lang.org/docs/differentiable_programming),  [quantized computation](https://github.com/taichi-dev/quantaichi) (experimental), and many more.
 
 ## Getting Started
 
@@ -69,7 +69,7 @@ Use Python's package installer **pip** to install Taichi Lang:
 pip install --upgrade taichi
 ```
 
-*We also provide a nightly package. Note that nightly packages may crash because they are not fully tested.  We cannot guarantee their validity, and you are at your own risk trying out our latest, untested features. The nightly packages can be installed from our self-hosted PyPI (Using self-hosted PyPI allows us to provide more frequent releases over a longer period of time)*
+*Nightly packages are also provided. Note: nightly packages may crash because they are not fully tested.  The AMD ROCm Taichi team does not guarantee their validity, and you are at your own risk trying out latest, untested features. The nightly packages can be installed from our self-hosted PyPI (Using self-hosted PyPI allows us to provide more frequent releases over a longer period of time)*
 
 ```bash
 pip install -i https://pypi.taichi.graphics/simple/ taichi-nightly
@@ -139,7 +139,7 @@ If you wish to try our experimental features or build Taichi Lang for your own e
 
 ## Contributing
 
-Kudos to all of our amazing contributors! Taichi Lang thrives through open-source. In that spirit, we welcome all kinds of contributions from the community. If you would like to participate, check out the [Contribution Guidelines](CONTRIBUTING.md) first.
+Thank you to all our amazing contributors! Taichi Lang thrives through open-source. In that spirit, we welcome all kinds of contributions from the community. If you would like to participate, review the [Contribution Guidelines](CONTRIBUTING.md) to get started.
 
 <a href="https://github.com/taichi-dev/taichi/graphs/contributors"><img src="https://raw.githubusercontent.com/taichi-dev/public_files/master/taichi/contributors_taichi-dev_taichi_18.png" width="800px"></a>
 
@@ -202,7 +202,7 @@ For more information about the events or community, please refer to [this page](
 
 ### Citations
 
-If you use Taichi Lang in your research, please cite the corresponding papers:
+To use Taichi Lang in your research, citations for corresponding papers are as follows:
 
 - [**(SIGGRAPH Asia 2019) Taichi: High-Performance Computation on Sparse Data Structures**](https://yuanming.taichi.graphics/publication/2019-taichi/taichi-lang.pdf) [[Video]](https://youtu.be/wKw8LMF3Djo) [[BibTex]](https://raw.githubusercontent.com/taichi-dev/taichi/master/misc/taichi_bibtex.txt) [[Code]](https://github.com/taichi-dev/taichi)
 - [**(ICLR 2020) DiffTaichi: Differentiable Programming for Physical Simulation**](https://arxiv.org/abs/1910.00935) [[Video]](https://www.youtube.com/watch?v=Z1xvAZve9aE) [[BibTex]](https://raw.githubusercontent.com/taichi-dev/taichi/master/misc/difftaichi_bibtex.txt) [[Code]](https://github.com/yuanming-hu/difftaichi)
diff --git a/benchmarks/microbenchmarks/_utils.py b/benchmarks/microbenchmarks/_utils.py
index 4c887386978b9..721db75ac98f5 100644
--- a/benchmarks/microbenchmarks/_utils.py
+++ b/benchmarks/microbenchmarks/_utils.py
@@ -53,7 +53,7 @@ def get_ti_arch(arch: str):
         "opengl": ti.opengl,
         "metal": ti.metal,
         "x64": ti.x64,
-        "cc": ti.cc,
+        "amdgpu":ti.amdgpu,
     }
     return arch_dict[arch]
 
diff --git a/benchmarks/microbenchmarks/atomic_ops.py b/benchmarks/microbenchmarks/atomic_ops.py
index 05abbc04e14eb..51c3d486c8d4a 100644
--- a/benchmarks/microbenchmarks/atomic_ops.py
+++ b/benchmarks/microbenchmarks/atomic_ops.py
@@ -31,7 +31,7 @@ def reduction_array(y: ti.types.ndarray(), x: ti.types.ndarray()):
 
 class AtomicOpsPlan(BenchmarkPlan):
     def __init__(self, arch: str):
-        super().__init__("atomic_ops", arch, basic_repeat_times=10)
+        super().__init__("atomic_ops", arch, basic_repeat_times=1)
         atomic_ops = AtomicOps()
         atomic_ops.remove(["atomic_sub", "atomic_and", "atomic_xor", "atomic_max"])
         self.create_plan(atomic_ops, Container(), DataType(), DataSize(), MetricType())
diff --git a/benchmarks/requirements.txt b/benchmarks/requirements.txt
index 625298e5a2b64..45ac07aadefaf 100644
--- a/benchmarks/requirements.txt
+++ b/benchmarks/requirements.txt
@@ -1,2 +1,3 @@
 jsbeautifier
 bokeh
+argparse
diff --git a/benchmarks/run.py b/benchmarks/run.py
index a1e47ed326e99..df651f5650bc3 100644
--- a/benchmarks/run.py
+++ b/benchmarks/run.py
@@ -1,5 +1,6 @@
 import os
 import warnings
+import argparse
 
 from suite_microbenchmarks import MicroBenchmark
 from taichi._lib import core as ti_python_core
@@ -23,9 +24,9 @@ def __init__(self):
         for suite in benchmark_suites:
             self._suites.append(suite())
 
-    def run(self):
+    def run(self, arch, benchmark_plan):
         for suite in self._suites:
-            suite.run()
+            suite.run(arch, benchmark_plan)
 
     def save(self, benchmark_dir="./"):
         for suite in self._suites:
@@ -39,15 +40,27 @@ def get_suites_info(self):
             info_dict[suite.suite_name] = suite.get_benchmark_info()
         return info_dict
 
+def parse_cmdln():
+    parser = argparse.ArgumentParser(prog='run.py', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument("--arch",
+            choices=['amdgpu', 'cuda', 'vulkan', 'opengl', 'metal', 'x64'],
+            required=True, help="Architecture to benchmark")
+    parser.add_argument("--benchmark_plan", 
+            choices=['AtomicOpsPlan', 'FillPlan', 'MathOpsPlan', 
+                'MatrixOpsPlan', 'MemcpyPlan', 'SaxpyPlan', 'Stencil2DPlan'],
+            required=True, help="Benchmark plan to run")
+    args = parser.parse_args()
+    return args
 
 def main():
+    args = parse_cmdln()
     benchmark_dir = os.path.join(os.getcwd(), "results")
     os.makedirs(benchmark_dir, exist_ok=True)
 
     # init & run
     info = BenchmarkInfo()
     suites = BenchmarkSuites()
-    suites.run()
+    suites.run(args.arch, args.benchmark_plan)
     # save benchmark results & info
     suites.save(benchmark_dir)
     info.suites = suites.get_suites_info()
diff --git a/benchmarks/suite_microbenchmarks.py b/benchmarks/suite_microbenchmarks.py
index e00aa90a3ae89..05f581a0a9dd8 100644
--- a/benchmarks/suite_microbenchmarks.py
+++ b/benchmarks/suite_microbenchmarks.py
@@ -8,9 +8,10 @@
 class MicroBenchmark:
     suite_name = "microbenchmarks"
     config = {
-        "cuda": {"enable": True},
+        "cuda": {"enable": False},
         "vulkan": {"enable": False},
         "opengl": {"enable": False},
+        "amdgpu": {"enable": True},
     }
 
     def __init__(self):
@@ -25,19 +26,34 @@ def get_benchmark_info(self):
                 arch_list.append(arch)
         info_dict["archs"] = arch_list
         return info_dict
+    
+    def str_to_plan(self, plan_str):
+        match plan_str:
+            case "AtomicOpsPlan":
+                return benchmark_plan_list[0]
+            case "FillPlan":
+                return benchmark_plan_list[1]
+            case "MathOpsPlan":
+                return benchmark_plan_list[2]
+            case "MatrixOpsPlan":
+                return benchmark_plan_list[3]
+            case "MemcpyPlan":
+                return benchmark_plan_list[4]
+            case "SaxpyPlan":
+                return benchmark_plan_list[5]
+            case "Stencil2DPlan":
+                return benchmark_plan_list[6]
 
-    def run(self):
-        for arch, item in self.config.items():
-            if item["enable"] == True:
-                arch_results = {}
-                self._info[arch] = {}
-                for plan in benchmark_plan_list:
-                    plan_impl = plan(arch)
-                    results = plan_impl.run()
-                    self._info[arch][plan_impl.name] = results["info"]
-                    arch_results[plan_impl.name] = results["results"]
-
-                self._results[arch] = arch_results
+    def run(self, arch, plan_str):
+        arch_results = {}
+        self._info[arch] = {}
+        plan = self.str_to_plan(plan_str)
+        plan_impl = plan(arch)
+        results = plan_impl.run()
+        self._info[arch][plan_impl.name] = results["info"]
+        arch_results[plan_impl.name] = results["results"]
+
+        self._results[arch] = arch_results
 
     def save_as_json(self, suite_dir="./"):
         for arch in self._results:
diff --git a/ci/assets/mitm-ca.crt b/ci/assets/mitm-ca.crt
deleted file mode 100644
index 35ac8064ade73..0000000000000
--- a/ci/assets/mitm-ca.crt
+++ /dev/null
@@ -1,25 +0,0 @@
------BEGIN CERTIFICATE-----
-MIIESTCCAzGgAwIBAgIUGVTxoAKRFazNmE0SGx7FMbAIeVgwDQYJKoZIhvcNAQEL
-BQAwgbMxCzAJBgNVBAYTAkNOMRAwDgYDVQQIDAdCZWlqaW5nMRgwFgYDVQQKDA9U
-YWljaGkgR3JhcGhpY3MxGjAYBgNVBAsMEVRhaWNoaSBMYW5nIENJL0NEMTQwMgYD
-VQQDDCtUYWljaGkgQnVpbGRib3RzIE1JVE0gQ2VydGlmaWNhdGUgQXV0aG9yaXR5
-MSYwJAYJKoZIhvcNAQkBFhdiaW53YW5nQHRhaWNoaS5ncmFwaGljczAeFw0yMjEx
-MDkwODU2MDRaFw0zMjExMDYwODU2MDRaMIGzMQswCQYDVQQGEwJDTjEQMA4GA1UE
-CAwHQmVpamluZzEYMBYGA1UECgwPVGFpY2hpIEdyYXBoaWNzMRowGAYDVQQLDBFU
-YWljaGkgTGFuZyBDSS9DRDE0MDIGA1UEAwwrVGFpY2hpIEJ1aWxkYm90cyBNSVRN
-IENlcnRpZmljYXRlIEF1dGhvcml0eTEmMCQGCSqGSIb3DQEJARYXYmlud2FuZ0B0
-YWljaGkuZ3JhcGhpY3MwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQCo
-l0vo88tMrh5pOe7YUuGxqP0CE78qaESvHNt3L8fG4C9skHzI9uBQDkKyp8vTi6hu
-luTsPJ2wF59ok8q/RCu0fDT3k1myuBRVsxg7VB7ekubdsfxgyJIqd5c2GByf3+rD
-S/emPNZGCZ6VOrBapJadbPawiCO+NBMQZkFAMK2rDfPii/fxmssZ5DF75PHE6BON
-oHOUv7gz6FDOQn1w6VacSn1xNdWtQRo974HtIJs8ODoi4e4Gt1HkS0GxTQ0sLQad
-wxcGTaIJafrYizhTO3xCo/57o7ckS3Hd4rX21C1atLnlhPAPJ93MbVwMSWJfGPIA
-RH92Wj6psI7ryXsgD6XrAgMBAAGjUzBRMB0GA1UdDgQWBBQompdQJjwjq49IJPOi
-Gwfh3g5mizAfBgNVHSMEGDAWgBQompdQJjwjq49IJPOiGwfh3g5mizAPBgNVHRMB
-Af8EBTADAQH/MA0GCSqGSIb3DQEBCwUAA4IBAQARYA5Ul/WITZf3m2fxp4yNrs3p
-zBpQdqS8O8Y/0U3kvNBSRvXUzThHnzUSC47M1lpXpDS7OVBA3gmpWNG9hsi1BVlh
-q44amtZWNWMzPI7KtxC/BcxMER06iBv/3NdIL2gvAW8+mrGbT5UxjiKRFqgSpCeN
-A6/NxlpG2tiMRsRx7l7OnybPvmxP9/hx+3OFq0T2gHbOtdmvggIp4Oe4O8fQUGOj
-x0Ug2nEqzY/XYp8uj4NUge/3HGDFgOkGK+hRkVDVWOY3GE3yGJx+46ifDRChYGuU
-rugHTLTg1YmbYAH2HtwwvPSPWXn/b50oSNXeiiW/Y88GLXKK7TWlVbVQKNAT
------END CERTIFICATE-----
diff --git a/cmake/TaichiCXXFlags.cmake b/cmake/TaichiCXXFlags.cmake
index 58b9a5a3ded3d..8d6b56718565d 100644
--- a/cmake/TaichiCXXFlags.cmake
+++ b/cmake/TaichiCXXFlags.cmake
@@ -56,8 +56,8 @@ if (WIN32)
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267 /wd4624 /nologo /D \"_CRT_SECURE_NO_WARNINGS\" /D \"_ENABLE_EXTENDED_ALIGNED_STORAGE\"")
     else()
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -fsized-deallocation -target x86_64-pc-windows-msvc")
-        set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -gcodeview")
-        set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -gcodeview")
+        set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -gcodeview -g")
+        set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -gcodeview -g")
     endif()
 else()
     if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake
index b716b071d48fc..0c22d7331dcdc 100644
--- a/cmake/TaichiCore.cmake
+++ b/cmake/TaichiCore.cmake
@@ -106,7 +106,7 @@ if (TI_WITH_CUDA)
 endif()
 
 if (TI_WITH_AMDGPU)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_WITH_AMDGPU")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -DTI_WITH_AMDGPU")
   file(GLOB TAICHI_AMDGPU_RUNTIME_SOURCE "taichi/runtime/amdgpu/runtime.cpp")
   list(APPEND TAIHI_CORE_SOURCE ${TAICHI_AMDGPU_RUNTIME_SOURCE})
 endif()
diff --git a/docs/about/license.rst b/docs/about/license.rst
new file mode 100644
index 0000000000000..85a8ca410fbb2
--- /dev/null
+++ b/docs/about/license.rst
@@ -0,0 +1,12 @@
+.. meta::
+  :description: Taichi Lang license
+  :keywords: Taichi Lang, parallel programming, ROCm, developer, reference, python
+
+.. _license:
+
+******************************************
+License
+******************************************
+
+.. include:: ../../LICENSE
+  :literal:
\ No newline at end of file
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000000000..8ae88ec38c22a
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,58 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+import re
+
+'''
+html_theme is usually unchanged (rocm_docs_theme).
+flavor defines the site header display, select the flavor for the corresponding portals
+flavor options: rocm, rocm-docs-home, rocm-blogs, rocm-ds, instinct, ai-developer-hub, local, generic
+'''
+html_theme = "rocm_docs_theme"
+html_theme_options = {"flavor": "rocm-simulation", "repository_url": "https://github.com/ROCm/taichi/"}
+
+'''
+docs_header_version is used to manually configure the version in the header. If
+there exists a non-null value mapped to docs_header_version, then the header in
+the documentation page will contain the given version string.
+'''
+html_context = {
+    "docs_header_version": "25.11"
+}
+
+
+# This section turns on/off article info
+setting_all_article_info = True
+all_article_info_os = ["linux"]
+all_article_info_author = ""
+
+# Dynamically extract component version
+with open('../CMakeLists.txt', encoding='utf-8') as f:
+    pattern = r'.*\brocm_setup_version\(VERSION\s+([0-9A-Za-z._-]+)' # Update according to each component's CMakeLists.txt
+    match = re.search(pattern,
+                      f.read())
+    if not match:
+        raise ValueError("VERSION not found!")
+    version_number = match[1]
+
+# for PDF output on Read the Docs
+project = "Taichi Lang"
+author = "Advanced Micro Devices, Inc."
+copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved."
+version = version_number
+release = version_number
+
+external_toc_path = "./sphinx/_toc.yml" # Defines Table of Content structure definition path
+
+# Add more addtional package accordingly
+extensions = [
+    "rocm_docs", 
+    "sphinx.ext.autodoc", # for Python docstrings
+] 
+
+html_title = f"{project} {version_number} documentation"
+
+external_projects_current_project = "Taichi Lang"
diff --git a/docs/examples/taichi-examples.rst b/docs/examples/taichi-examples.rst
new file mode 100644
index 0000000000000..b4f0683fda779
--- /dev/null
+++ b/docs/examples/taichi-examples.rst
@@ -0,0 +1,140 @@
+.. meta::
+  :description: Taichi Lang examples
+  :keywords: Taichi Lang, JIT, parallel, programming, python, ROCm, example, sample, tutorial
+
+.. _run-a-taichi-example:
+
+********************************************************************
+Run a Taichi Lang example
+********************************************************************
+
+Prepared Taichi Lang examples using decorators
+================================================================================
+
+The following examples show you how to use decorators, and is organized by use case.
+
+Count primes
+--------------------------------------------------------------------------------
+
+In this example, the function ``is_prime`` will be used in the kernel ``count_primes``. 
+The code below is written as a Taichi program by decorating ``is_prime``
+with the Taichi Lang decorator ``@ti.func`` and decorating ``count_primes`` with the Taichi
+Lang decorator ``@ti.kernel``.
+
+1. To run this example, copy the code below to a file named ``count_primes.py``:
+
+.. code-block:: bash
+
+   import taichi as ti
+   ti.init(arch=ti.gpu)
+
+   @ti.func
+   def is_prime(n: int):
+      result = True
+      for k in range(2, int(n ** 0.5) + 1):
+         if n % k == 0:
+               result = False
+               break
+      return result
+
+   @ti.kernel
+   def count_primes(n: int) -> int:
+      count = 0
+      for k in range(2, n):
+         if is_prime(k):
+               count += 1
+
+      return count
+
+   print(count_primes(1000000))
+
+2. Once this file has been created, execute the code in your Docker container with the following command:
+
+.. code-block:: bash
+
+   python3 count_primes.py
+
+3. The output should be similar to the output below:
+
+.. code-block:: bash
+
+   [Taichi] version 1.8.0b2, llvm 20.0.0, commit f7911653, linux, python 3.12.3
+   [Taichi] Starting on arch=amdgpu
+   78498
+
+Longest common subsequence 
+--------------------------------------------------------------------------------
+
+This is an example of longest common subsequence kernel. You don't
+need a helper function, so the only decorator needed is ``@ti.kernel`` to 
+accelerate the kernel function ``compute_lcs``.
+
+1. To run this example, copy the code below into a file named ``lcs.py``:
+
+.. code-block:: bash
+
+   import taichi as ti
+   import numpy as np
+
+   ti.init(arch=ti.gpu)
+
+   benchmark = True
+
+   N = 15000
+
+   f = ti.field(dtype=ti.i32, shape=(N + 1, N + 1))
+
+   if benchmark:
+      a_numpy = np.random.randint(0, 100, N, dtype=np.int32)
+      b_numpy = np.random.randint(0, 100, N, dtype=np.int32)
+   else:
+      a_numpy = np.array([0, 1, 0, 2, 4, 3, 1, 2, 1], dtype=np.int32)
+      b_numpy = np.array([4, 0, 1, 4, 5, 3, 1, 2], dtype=np.int32)
+
+   @ti.kernel
+   def compute_lcs(a: ti.types.ndarray(), b: ti.types.ndarray()) -> ti.i32:
+      len_a, len_b = a.shape[0], b.shape[0]
+
+      ti.loop_config(serialize=True) # Disable auto-parallelism in Taichi
+      for i in range(1, len_a + 1):
+         for j in range(1, len_b + 1):
+                  f[i, j] = ti.max(f[i - 1, j - 1] + (a[i - 1] == b[j - 1]),
+                           ti.max(f[i - 1, j], f[i, j - 1]))
+
+      return f[len_a, len_b]
+
+
+   print(compute_lcs(a_numpy, b_numpy))
+
+2. Once this file has been created, execute the code in your Docker container with the following command:
+
+.. code-block:: bash
+
+   python3 lcs.py
+
+3. The output should be similar to the output below:
+
+.. code-block:: bash
+
+   [Taichi] version 1.8.0b2, llvm 20.0.0, commit f7911653, linux, python 3.12.3
+   [Taichi] Starting on arch=amdgpu
+   2706
+
+Use cases and recommendations
+================================================================================
+
+* The `Modernizing Taichi Lang to LLVM 20 for MI325X GPU Acceleration
+  <https://rocm.blogs.amd.com/artificial-intelligence/taichi_mi300x/README.html>`__
+  blog highlights Taichi Lang as an open-source programming language designed for high-performance
+  numerical computation, particularly in domains such as real-time physical simulation,
+  artificial intelligence, computer vision, robotics, and visual effects. Taichi Lang
+  is embedded in Python and uses just-in-time (JIT) compilation frameworks like
+  LLVM to optimize execution on GPUs and CPUs. The blog emphasizes the versatility
+  of Taichi Lang in enabling complex simulations and numerical algorithms, making
+  it ideal for developers working on compute-intensive tasks. Developers are
+  encouraged to follow recommended coding patterns and utilize Taichi Lang decorators
+  for performance optimization. Prebuilt Docker images integrating ROCm, PyTorch, and
+  Taichi are provided for simplified installation and deployment, making it easier
+  to leverage Taichi Lang for advanced computational workloads.
+
+Refer to the `AMD ROCm blog <https://rocm.blogs.amd.com/>`__ to search for Taichi examples and best practices to optimize your workflows on AMD GPUs.
diff --git a/docs/examples/taichi-get-started.rst b/docs/examples/taichi-get-started.rst
new file mode 100644
index 0000000000000..56207fd6bfb00
--- /dev/null
+++ b/docs/examples/taichi-get-started.rst
@@ -0,0 +1,64 @@
+.. meta::
+  :description: Get started using Taichi Lang
+  :keywords: Taichi Lang, JIT, parallel, programming, python, ROCm, example, sample, tutorial
+
+.. _taichi-get-started:
+
+********************************************************************
+Get started with Taichi Lang
+********************************************************************
+
+When writing compute-intensive tasks, you can make use of the two 
+decorators ``@ti.func`` and ``@ti.kernel``. Functions decorated 
+with ``@ti.kernel`` are kernels that serve as the entry points where
+Taichi’s runtime takes over the tasks, and they must be directly
+invoked by Python code. Functions decorated with ``@ti.func`` are
+building blocks of kernels and can only be invoked by another
+Taichi Lang function or a kernel. These decorators instruct Taichi
+Lang to take over the computation tasks and compile the decorated
+functions to machine code using just-in-time (JIT) compiler. As a
+result, calls to these functions are executed on multi-core CPUs
+or GPUs.
+
+Below you can see how simple it is to use the Taichi Lang ``@ti.func``
+and ``@ti.kernel`` decorators to accelerate Python code. First, you 
+can see the Python code without using Taichi Lang. To enact Taichi Lang 
+in this example, the function ``inv_square`` serves as a building 
+block function for the kernel ``partial_sum``.
+
+The example Python code without Taichi Lang:
+
+.. code-block:: bash
+
+    def inv_square(x):  # A function
+        return 1.0 / (x * x)
+
+    def partial_sum(n: int) -> float:  # A kernel
+        total = 0.0
+        for i in range(1, n + 1):
+            total += inv_square(i)
+        return total
+
+    partial_sum(1000)
+
+1. To write this code in Taichi Lang, import and initialize Taichi Lang for code acceleration:
+
+.. code-block:: bash
+   
+   import taichi as ti
+   ti.init(arch=ti.gpu)
+
+2. Then, you can decorate the building block function and kernel with the ``@ti.func`` and ``@ti.kernel`` decorators, respectively:
+
+.. code-block:: bash
+
+   @ti.func
+   def inv_square(x):  # A Taichi function
+       return 1.0 / (x * x)
+
+   @ti.kernel
+   def partial_sum(n: int) -> float:  # A kernel
+       total = 0.0
+       for i in range(1, n + 1):
+           total += inv_square(i)
+       return total
\ No newline at end of file
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000000000..8ecc6431ad235
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,39 @@
+.. meta::
+  :description: Taichi Lang documentation
+  :keywords: Taichi Lang, ROCm, documentation, LLVM, JIT, parallel programming, GPU
+
+.. _taichi-documentation-index:
+
+********************************************************************
+Taichi Lang documentation
+********************************************************************
+
+`Taichi Lang <https://www.taichi-lang.org/>`_ is an open-source, imperative, and parallel 
+programming language designed for high-performance numerical computation. Taichi Lang is 
+part of the AMD ROCm™ Simulation Domain toolkit (ROCm-Simulation). Embedded in Python,
+it leverages just-in-time (JIT) compilation frameworks such as LLVM to accelerate 
+compute-intensive Python code by compiling it to native GPU or CPU instructions.
+
+Taichi Lang is widely used across various domains, such as real-time physical simulation, 
+numerical computing, augmented reality, artificial intelligence, computer vision, robotics, 
+visual effects for film and gaming, and general-purpose computing.
+
+The Taichi Lang public repository is located at `https://github.com/ROCm/taichi <https://github.com/ROCm/taichi>`__.
+
+.. grid:: 2
+  :gutter: 3
+
+  .. grid-item-card:: Install
+
+    * :doc:`Install Taichi Lang <install/taichi-install>`
+
+  .. grid-item-card:: Examples
+
+    * :doc:`Get started with Taichi Lang <examples/taichi-get-started>`  
+    * :doc:`Run a Taichi Lang example <examples/taichi-examples>`  
+
+
+To contribute to the documentation, refer to
+`Contributing to ROCm <https://rocm.docs.amd.com/en/latest/contribute/contributing.html>`_.
+
+You can find licensing information on the :doc:`Licensing <about/license>` page.
diff --git a/docs/install/taichi-install.rst b/docs/install/taichi-install.rst
new file mode 100644
index 0000000000000..45f9c1cbe608d
--- /dev/null
+++ b/docs/install/taichi-install.rst
@@ -0,0 +1,289 @@
+.. meta::
+  :description: installing Taichi Lang for ROCm
+  :keywords: installation instructions, parallel programming, JIT, LLVM, AMD, ROCm, Taichi Lang
+
+.. _taichi-on-rocm-installation:
+
+********************************************************************
+Taichi Lang on ROCm installation
+********************************************************************
+
+System requirements
+====================================================================
+
+To use Taichi Lang `1.8.0b2 <https://github.com/ROCm/taichi/tree/release/1.8.0b2>`__, you need the following prerequisites:
+
+- **ROCm version:** `7.0.0 <https://repo.radeon.com/rocm/apt/7.0/>`__ (recommended)
+- **Operating system:** Ubuntu 22.04, 24.04
+- **GPU platform:** AMD Instinct™ MI355X, MI325X, MI300X, MI250X, MI210
+- **Python:** `3.12.3 <https://www.python.org/downloads/release/python-3123/>`__, `3.10.12 <https://www.python.org/downloads/release/python-31012/>`__
+
+Install Taichi
+================================================================================
+
+To install Taichi Lang on ROCm, you have the following options:
+
+- :ref:`Use the prebuilt Docker image <using-docker-with-taichi-pre-installed>` **(recommended)**
+- :ref:`Use a wheels package <taichi-wheels-package>`
+- :ref:`Build your own docker image <build-taichi-rocm-docker-image>`
+
+.. _using-docker-with-taichi-pre-installed:
+
+Use a prebuilt Docker image with Taichi Lang pre-installed
+--------------------------------------------------------------------------------
+
+Docker is the recommended method to set up a Taichi Lang environment, as it avoids potential installation issues. 
+The tested, prebuilt image includes Taichi, Python, ROCm, and other dependencies.
+
+1. Pull the Docker image:
+
+   .. tab-set::
+
+      .. tab-item:: Ubuntu 24.04
+         :sync: ubuntu-24
+
+         .. code-block:: shell
+
+            docker pull rocm/taichi:taichi-1.8.0b2_rocm7.0.0_ubuntu24.04_py3.12.3
+
+         See `rocm/taichi:taichi-1.8.0b2_rocm7.0.0_ubuntu24.04_py3.12.3
+         <https://hub.docker.com/layers/rocm/taichi/taichi-1.8.0b2_rocm7.0.0_ubuntu24.04_py3.12.3/images/sha256-15bb5ad69e0dff0cc5e0805a5d271a82d04d6d9c32d668bfcf34981ca99f68eb>`__
+         on Docker Hub.
+
+      .. tab-item:: Ubuntu 22.04
+         :sync: ubuntu-22
+
+         .. code-block:: shell
+
+            docker pull rocm/taichi:taichi-1.8.0b2_rocm7.0.0_ubuntu22.04_py3.10.12
+
+         See `rocm/taichi:taichi-1.8.0b2_rocm7.0.0_ubuntu22.04_py3.10.12
+         <https://hub.docker.com/layers/rocm/taichi/taichi-1.8.0b2_rocm7.0.0_ubuntu22.04_py3.10.12/images/sha256-151516ac981ab1309fca0d868e25dd1231728449b6563b5e8c660d7cb47777fb>`__
+         on Docker Hub.
+
+2. Launch and connect to the container:
+
+   .. tab-set::
+
+      .. tab-item:: Ubuntu 24.04
+         :sync: ubuntu-24
+
+         .. code-block:: shell
+
+            docker run -it -d \
+               --cap-add=SYS_PTRACE \
+               --security-opt seccomp=unconfined \
+               --ipc=host \
+               --shm-size=64G \
+               --network=host \
+               --device=/dev/kfd \
+               --device=/dev/dri \
+               --group-add video \
+               -v "$(pwd)":/taichi_dir \
+               --name rocm_taichi \
+               rocm/taichi:taichi-1.8.0b2_rocm7.0.0_ubuntu24.04_py3.12.3
+
+      .. tab-item:: Ubuntu 22.04
+         :sync: ubuntu-22
+
+         .. code-block:: shell
+
+            docker run -it -d \
+               --cap-add=SYS_PTRACE \
+               --security-opt seccomp=unconfined \
+               --ipc=host \
+               --shm-size=64G \
+               --network=host \
+               --device=/dev/kfd \
+               --device=/dev/dri \
+               --group-add video \
+               -v "$(pwd)":/taichi_dir \
+               --name rocm_taichi \
+               rocm/taichi:taichi-1.8.0b2_rocm7.0.0_ubuntu22.04_py3.10.12
+
+.. _taichi-wheels-package:
+
+Use a wheels package
+--------------------------------------------------------------------------------
+
+The Taichi Lang ``.whl`` packages are hosted on the AMD PyPI repository. 
+Instead of manually downloading the files, you can simply install Taichi Lang using ``pip`` with the provided URL. 
+This command will automatically download and install the appropriate ``.whl`` file.
+
+.. code-block:: bash
+
+   pip install amd-taichi==1.8.0b2 --index-url=https://pypi.amd.com/simple
+   sudo apt-get update
+   sudo apt-get install -y lld
+
+.. _build-taichi-rocm-docker-image:
+
+Build your own Docker image
+--------------------------------------------------------------------------------
+
+If you prefer to use the ROCm Ubuntu image, or already have a ROCm Ubuntu container, follow these steps to install Taichi in the container.
+
+1. Pull the ROCm Ubuntu Docker image:
+
+   .. tab-set::
+
+      .. tab-item:: Ubuntu 24.04
+         :sync: ubuntu-24
+
+         .. code-block:: shell
+
+            docker pull rocm/dev-ubuntu-24.04:7.0-complete
+
+         See `rocm/dev-ubuntu-24.04:7.0-complete
+         <https://hub.docker.com/layers/rocm/dev-ubuntu-24.04/7.0-complete/images/sha256-ffd8ac00ca6c8e2dbfd0c364c7cc27542f90148f3f358d74efd028f67c33607b>`__
+         on Docker Hub.
+
+      .. tab-item:: Ubuntu 22.04
+         :sync: ubuntu-22
+
+         .. code-block:: shell
+
+            docker pull rocm/dev-ubuntu-22.04:7.0-complete
+
+         See `rocm/dev-ubuntu-22.04:7.0-complete
+         <https://hub.docker.com/layers/rocm/dev-ubuntu-22.04/7.0-complete/images/sha256-b4be4b0b29e46d56e9bea2cd06500f4519aaac30dc5df02bd4710bbf393c1c4c>`__
+         on Docker Hub.
+
+2. Launch the Docker container:
+
+   .. tab-set::
+
+      .. tab-item:: Ubuntu 24.04
+         :sync: ubuntu-24
+
+         .. code-block:: shell
+
+            docker run -it -d \
+                  --cap-add=SYS_PTRACE \
+                  --security-opt seccomp=unconfined \
+                  --ipc=host \
+                  --shm-size=64G \
+                  --network=host \
+                  --device=/dev/kfd \
+                  --device=/dev/dri \
+                  --group-add video \
+                  -v "$(pwd)":/taichi_dir \
+                  --name rocm_taichi \
+                  rocm/dev-ubuntu-24.04:7.0-complete \
+                  /bin/bash
+
+      .. tab-item:: Ubuntu 22.04
+         :sync: ubuntu-22
+
+         .. code-block:: shell
+
+            docker run -it -d \
+                  --cap-add=SYS_PTRACE \
+                  --security-opt seccomp=unconfined \
+                  --ipc=host \
+                  --shm-size=64G \
+                  --network=host \
+                  --device=/dev/kfd \
+                  --device=/dev/dri \
+                  --group-add video \
+                  -v "$(pwd)":/taichi_dir \
+                  --name rocm_taichi \
+                  rocm/dev-ubuntu-22.04:7.0-complete \
+                  /bin/bash
+
+3. Inside the running container, install build dependencies:
+
+   .. code-block:: bash
+
+      apt-get update && apt-get install -y --no-install-recommends \
+         git wget vim git freeglut3-dev libglfw3-dev libglm-dev \
+         libglu1-mesa-dev libjpeg-dev liblz4-dev libpng-dev \
+         libssl-dev libwayland-dev libx11-xcb-dev libxcb-dri3-dev \
+         libxcb-ewmh-dev libxcb-keysyms1-dev libxcb-randr0-dev \
+         libxcursor-dev libxi-dev libxinerama-dev libxrandr-dev \
+         libzstd-dev python3-pip cmake pybind11-dev \
+         ca-certificates python3-venv rocm-llvm-dev \
+         gdb python3-dbg 
+
+4. Setup LLVM:
+
+   .. code-block:: bash
+      
+      export LLVM_VERSION=20
+      export LLVM_PATH=/usr/lib/llvm-${LLVM_VERSION}
+      export PATH=${LLVM_PATH}/bin:$PATH
+      
+      wget https://apt.llvm.org/llvm.sh \
+         && chmod +x llvm.sh \
+         && apt-get update && apt-get install -y \
+         lsb-release software-properties-common gnupg \
+         && ./llvm.sh ${LLVM_VERSION} llvm clang lld
+
+5. Clone the `https://github.com/ROCm/taichi <https://github.com/ROCm/taichi>`_ repository with the desired branch:
+
+   .. code-block:: bash
+      
+      cd /taichi_dir
+      git clone --recursive https://github.com/ROCm/taichi -b release/v1.8.0b2
+      cd taichi
+
+6. Build the Taichi Lang wheel:
+
+   .. code-block:: bash
+
+      export GPU_TARGETS=gfx950,gfx942,gfx90a
+      export TAICHI_CMAKE_ARGS="-DTI_WITH_VULKAN=OFF -DTI_WITH_OPENGL=OFF -DTI_BUILD_TESTS=ON -DTI_BUILD_EXAMPLES=OFF -DCMAKE_PREFIX_PATH=${LLVM_PATH}/lib/cmake -DCMAKE_CXX_COMPILER=${LLVM_PATH}/bin/clang++ -DTI_WITH_AMDGPU=ON -DTI_WITH_CUDA=OFF -DTI_AMDGPU_ARCHS=${GPU_TARGETS} -DUSE_LLD=ON -DTI_WITH_LLVM=ON"
+
+      cd /taichi_dir/taichi/external/spdlog \
+      && git apply /taichi_dir/taichi/spdlog_fmt.patch \
+      && cd /taichi_dir/taichi \
+      && ./build.py
+
+7. Install the Taichi Lang ``.whl`` file:
+
+   .. code-block:: bash
+
+      python3 -m pip config set global.break-system-packages true \
+      && python3 -m pip install /taichi_dir/taichi/dist/taichi*.whl
+
+
+.. _build-taichi-docker-from-source:
+
+Test the Taichi Lang installation
+================================================================================
+
+Clone the `https://github.com/ROCm/taichi <https://github.com/ROCm/taichi>`_ repository:
+
+.. code-block:: bash
+
+   sudo apt-get update
+   sudo apt-get install -y git
+   git clone --recursive https://github.com/ROCm/taichi -b amd-release/v1.8.0b2
+
+To test the Taichi Lang installation, run the ``laplace`` example in the source code: 
+
+.. code-block:: bash
+
+   python3 taichi/python/taichi/examples/algorithm/laplace.py
+
+Example output using ``laplace``:
+
+.. code-block:: bash
+
+   [Taichi] version 1.8.0, llvm 15.0.0, commit f7911653, linux, python python 3.12.3
+   [Taichi] Starting on arch=amdgpu
+   0.0
+   4.0
+   0.0
+   0.0
+   4.0
+   0.0
+   0.0
+   4.0
+   0.0
+   0.0
+
+Run a Taichi Lang example
+====================================================================
+
+A set of examples is available to help you get started. See :doc:`run a Taichi Lang example <../examples/taichi-examples>` for more details.
diff --git a/docs/conftest.py b/docs/source/conftest.py
similarity index 100%
rename from docs/conftest.py
rename to docs/source/conftest.py
diff --git a/docs/cover-in-ci.lst b/docs/source/cover-in-ci.lst
similarity index 100%
rename from docs/cover-in-ci.lst
rename to docs/source/cover-in-ci.lst
diff --git a/docs/design/llvm_sparse_runtime.md b/docs/source/design/llvm_sparse_runtime.md
similarity index 100%
rename from docs/design/llvm_sparse_runtime.md
rename to docs/source/design/llvm_sparse_runtime.md
diff --git a/docs/fragments/.gitkeep b/docs/source/fragments/.gitkeep
similarity index 100%
rename from docs/fragments/.gitkeep
rename to docs/source/fragments/.gitkeep
diff --git a/docs/lang/articles/about/_category_.json b/docs/source/lang/articles/about/_category_.json
similarity index 100%
rename from docs/lang/articles/about/_category_.json
rename to docs/source/lang/articles/about/_category_.json
diff --git a/docs/lang/articles/about/overview.md b/docs/source/lang/articles/about/overview.md
similarity index 100%
rename from docs/lang/articles/about/overview.md
rename to docs/source/lang/articles/about/overview.md
diff --git a/docs/lang/articles/advanced/_category_.json b/docs/source/lang/articles/advanced/_category_.json
similarity index 100%
rename from docs/lang/articles/advanced/_category_.json
rename to docs/source/lang/articles/advanced/_category_.json
diff --git a/docs/lang/articles/advanced/argument_pack.md b/docs/source/lang/articles/advanced/argument_pack.md
similarity index 100%
rename from docs/lang/articles/advanced/argument_pack.md
rename to docs/source/lang/articles/advanced/argument_pack.md
diff --git a/docs/lang/articles/advanced/data_oriented_class.md b/docs/source/lang/articles/advanced/data_oriented_class.md
similarity index 100%
rename from docs/lang/articles/advanced/data_oriented_class.md
rename to docs/source/lang/articles/advanced/data_oriented_class.md
diff --git a/docs/lang/articles/advanced/dataclass.md b/docs/source/lang/articles/advanced/dataclass.md
similarity index 100%
rename from docs/lang/articles/advanced/dataclass.md
rename to docs/source/lang/articles/advanced/dataclass.md
diff --git a/docs/lang/articles/advanced/meta.md b/docs/source/lang/articles/advanced/meta.md
similarity index 100%
rename from docs/lang/articles/advanced/meta.md
rename to docs/source/lang/articles/advanced/meta.md
diff --git a/docs/lang/articles/advanced/odop.md b/docs/source/lang/articles/advanced/odop.md
similarity index 100%
rename from docs/lang/articles/advanced/odop.md
rename to docs/source/lang/articles/advanced/odop.md
diff --git a/docs/lang/articles/advanced/quant.md b/docs/source/lang/articles/advanced/quant.md
similarity index 100%
rename from docs/lang/articles/advanced/quant.md
rename to docs/source/lang/articles/advanced/quant.md
diff --git a/docs/lang/articles/basic/_category_.json b/docs/source/lang/articles/basic/_category_.json
similarity index 100%
rename from docs/lang/articles/basic/_category_.json
rename to docs/source/lang/articles/basic/_category_.json
diff --git a/docs/lang/articles/basic/external.md b/docs/source/lang/articles/basic/external.md
similarity index 100%
rename from docs/lang/articles/basic/external.md
rename to docs/source/lang/articles/basic/external.md
diff --git a/docs/lang/articles/basic/field.md b/docs/source/lang/articles/basic/field.md
similarity index 100%
rename from docs/lang/articles/basic/field.md
rename to docs/source/lang/articles/basic/field.md
diff --git a/docs/lang/articles/basic/layout.md b/docs/source/lang/articles/basic/layout.md
similarity index 100%
rename from docs/lang/articles/basic/layout.md
rename to docs/source/lang/articles/basic/layout.md
diff --git a/docs/lang/articles/basic/ndarray.md b/docs/source/lang/articles/basic/ndarray.md
similarity index 100%
rename from docs/lang/articles/basic/ndarray.md
rename to docs/source/lang/articles/basic/ndarray.md
diff --git a/docs/lang/articles/basic/offset.md b/docs/source/lang/articles/basic/offset.md
similarity index 100%
rename from docs/lang/articles/basic/offset.md
rename to docs/source/lang/articles/basic/offset.md
diff --git a/docs/lang/articles/basic/sparse.md b/docs/source/lang/articles/basic/sparse.md
similarity index 100%
rename from docs/lang/articles/basic/sparse.md
rename to docs/source/lang/articles/basic/sparse.md
diff --git a/docs/lang/articles/c-api/_category_.json b/docs/source/lang/articles/c-api/_category_.json
similarity index 100%
rename from docs/lang/articles/c-api/_category_.json
rename to docs/source/lang/articles/c-api/_category_.json
diff --git a/docs/lang/articles/c-api/taichi_core.md b/docs/source/lang/articles/c-api/taichi_core.md
similarity index 100%
rename from docs/lang/articles/c-api/taichi_core.md
rename to docs/source/lang/articles/c-api/taichi_core.md
diff --git a/docs/lang/articles/c-api/taichi_vulkan.md b/docs/source/lang/articles/c-api/taichi_vulkan.md
similarity index 100%
rename from docs/lang/articles/c-api/taichi_vulkan.md
rename to docs/source/lang/articles/c-api/taichi_vulkan.md
diff --git a/docs/lang/articles/contribution/_category_.json b/docs/source/lang/articles/contribution/_category_.json
similarity index 100%
rename from docs/lang/articles/contribution/_category_.json
rename to docs/source/lang/articles/contribution/_category_.json
diff --git a/docs/lang/articles/contribution/contributor_guide.md b/docs/source/lang/articles/contribution/contributor_guide.md
similarity index 100%
rename from docs/lang/articles/contribution/contributor_guide.md
rename to docs/source/lang/articles/contribution/contributor_guide.md
diff --git a/docs/lang/articles/contribution/dev_install.md b/docs/source/lang/articles/contribution/dev_install.md
similarity index 100%
rename from docs/lang/articles/contribution/dev_install.md
rename to docs/source/lang/articles/contribution/dev_install.md
diff --git a/docs/lang/articles/contribution/developer_utilities.md b/docs/source/lang/articles/contribution/developer_utilities.md
similarity index 100%
rename from docs/lang/articles/contribution/developer_utilities.md
rename to docs/source/lang/articles/contribution/developer_utilities.md
diff --git a/docs/lang/articles/contribution/development_tips.md b/docs/source/lang/articles/contribution/development_tips.md
similarity index 100%
rename from docs/lang/articles/contribution/development_tips.md
rename to docs/source/lang/articles/contribution/development_tips.md
diff --git a/docs/lang/articles/contribution/doc_writing.md b/docs/source/lang/articles/contribution/doc_writing.md
similarity index 100%
rename from docs/lang/articles/contribution/doc_writing.md
rename to docs/source/lang/articles/contribution/doc_writing.md
diff --git a/docs/lang/articles/contribution/style_guide_en.md b/docs/source/lang/articles/contribution/style_guide_en.md
similarity index 100%
rename from docs/lang/articles/contribution/style_guide_en.md
rename to docs/source/lang/articles/contribution/style_guide_en.md
diff --git a/docs/lang/articles/contribution/windows_debug.md b/docs/source/lang/articles/contribution/windows_debug.md
similarity index 100%
rename from docs/lang/articles/contribution/windows_debug.md
rename to docs/source/lang/articles/contribution/windows_debug.md
diff --git a/docs/lang/articles/contribution/write_test.md b/docs/source/lang/articles/contribution/write_test.md
similarity index 100%
rename from docs/lang/articles/contribution/write_test.md
rename to docs/source/lang/articles/contribution/write_test.md
diff --git a/docs/lang/articles/contribution/writing_cpp_tests.md b/docs/source/lang/articles/contribution/writing_cpp_tests.md
similarity index 100%
rename from docs/lang/articles/contribution/writing_cpp_tests.md
rename to docs/source/lang/articles/contribution/writing_cpp_tests.md
diff --git a/docs/lang/articles/debug/_category_.json b/docs/source/lang/articles/debug/_category_.json
similarity index 100%
rename from docs/lang/articles/debug/_category_.json
rename to docs/source/lang/articles/debug/_category_.json
diff --git a/docs/lang/articles/debug/debugging.md b/docs/source/lang/articles/debug/debugging.md
similarity index 100%
rename from docs/lang/articles/debug/debugging.md
rename to docs/source/lang/articles/debug/debugging.md
diff --git a/docs/lang/articles/deployment/_category_.json b/docs/source/lang/articles/deployment/_category_.json
similarity index 100%
rename from docs/lang/articles/deployment/_category_.json
rename to docs/source/lang/articles/deployment/_category_.json
diff --git a/docs/lang/articles/deployment/tutorial.md b/docs/source/lang/articles/deployment/tutorial.md
similarity index 100%
rename from docs/lang/articles/deployment/tutorial.md
rename to docs/source/lang/articles/deployment/tutorial.md
diff --git a/docs/lang/articles/differentiable/_category_.json b/docs/source/lang/articles/differentiable/_category_.json
similarity index 100%
rename from docs/lang/articles/differentiable/_category_.json
rename to docs/source/lang/articles/differentiable/_category_.json
diff --git a/docs/lang/articles/differentiable/differentiable_programming.md b/docs/source/lang/articles/differentiable/differentiable_programming.md
similarity index 100%
rename from docs/lang/articles/differentiable/differentiable_programming.md
rename to docs/source/lang/articles/differentiable/differentiable_programming.md
diff --git a/docs/lang/articles/faqs/_category_.json b/docs/source/lang/articles/faqs/_category_.json
similarity index 100%
rename from docs/lang/articles/faqs/_category_.json
rename to docs/source/lang/articles/faqs/_category_.json
diff --git a/docs/lang/articles/faqs/faq.md b/docs/source/lang/articles/faqs/faq.md
old mode 100755
new mode 100644
similarity index 100%
rename from docs/lang/articles/faqs/faq.md
rename to docs/source/lang/articles/faqs/faq.md
diff --git a/docs/lang/articles/faqs/install.md b/docs/source/lang/articles/faqs/install.md
similarity index 100%
rename from docs/lang/articles/faqs/install.md
rename to docs/source/lang/articles/faqs/install.md
diff --git a/docs/lang/articles/get-started/_category_.json b/docs/source/lang/articles/get-started/_category_.json
similarity index 100%
rename from docs/lang/articles/get-started/_category_.json
rename to docs/source/lang/articles/get-started/_category_.json
diff --git a/docs/lang/articles/get-started/accelerate_python.md b/docs/source/lang/articles/get-started/accelerate_python.md
similarity index 100%
rename from docs/lang/articles/get-started/accelerate_python.md
rename to docs/source/lang/articles/get-started/accelerate_python.md
diff --git a/docs/lang/articles/get-started/accelerate_pytorch.md b/docs/source/lang/articles/get-started/accelerate_pytorch.md
similarity index 100%
rename from docs/lang/articles/get-started/accelerate_pytorch.md
rename to docs/source/lang/articles/get-started/accelerate_pytorch.md
diff --git a/docs/lang/articles/get-started/cloth_simulation.md b/docs/source/lang/articles/get-started/cloth_simulation.md
similarity index 100%
rename from docs/lang/articles/get-started/cloth_simulation.md
rename to docs/source/lang/articles/get-started/cloth_simulation.md
diff --git a/docs/lang/articles/get-started/hello_world.md b/docs/source/lang/articles/get-started/hello_world.md
similarity index 100%
rename from docs/lang/articles/get-started/hello_world.md
rename to docs/source/lang/articles/get-started/hello_world.md
diff --git a/docs/lang/articles/glossary/_category_.json b/docs/source/lang/articles/glossary/_category_.json
similarity index 100%
rename from docs/lang/articles/glossary/_category_.json
rename to docs/source/lang/articles/glossary/_category_.json
diff --git a/docs/lang/articles/glossary/glossary.md b/docs/source/lang/articles/glossary/glossary.md
similarity index 100%
rename from docs/lang/articles/glossary/glossary.md
rename to docs/source/lang/articles/glossary/glossary.md
diff --git a/docs/lang/articles/internals/_category_.json b/docs/source/lang/articles/internals/_category_.json
similarity index 100%
rename from docs/lang/articles/internals/_category_.json
rename to docs/source/lang/articles/internals/_category_.json
diff --git a/docs/lang/articles/internals/compilation.md b/docs/source/lang/articles/internals/compilation.md
similarity index 100%
rename from docs/lang/articles/internals/compilation.md
rename to docs/source/lang/articles/internals/compilation.md
diff --git a/docs/lang/articles/internals/internal.md b/docs/source/lang/articles/internals/internal.md
similarity index 100%
rename from docs/lang/articles/internals/internal.md
rename to docs/source/lang/articles/internals/internal.md
diff --git a/docs/lang/articles/internals/life_of_kernel_lowres.jpg b/docs/source/lang/articles/internals/life_of_kernel_lowres.jpg
similarity index 100%
rename from docs/lang/articles/internals/life_of_kernel_lowres.jpg
rename to docs/source/lang/articles/internals/life_of_kernel_lowres.jpg
diff --git a/docs/lang/articles/kernels/_category_.json b/docs/source/lang/articles/kernels/_category_.json
similarity index 100%
rename from docs/lang/articles/kernels/_category_.json
rename to docs/source/lang/articles/kernels/_category_.json
diff --git a/docs/lang/articles/kernels/kernel_function.md b/docs/source/lang/articles/kernels/kernel_function.md
similarity index 100%
rename from docs/lang/articles/kernels/kernel_function.md
rename to docs/source/lang/articles/kernels/kernel_function.md
diff --git a/docs/lang/articles/kernels/kernel_sync.md b/docs/source/lang/articles/kernels/kernel_sync.md
similarity index 100%
rename from docs/lang/articles/kernels/kernel_sync.md
rename to docs/source/lang/articles/kernels/kernel_sync.md
diff --git a/docs/lang/articles/math/_category_.json b/docs/source/lang/articles/math/_category_.json
similarity index 100%
rename from docs/lang/articles/math/_category_.json
rename to docs/source/lang/articles/math/_category_.json
diff --git a/docs/lang/articles/math/linear_solver.md b/docs/source/lang/articles/math/linear_solver.md
similarity index 100%
rename from docs/lang/articles/math/linear_solver.md
rename to docs/source/lang/articles/math/linear_solver.md
diff --git a/docs/lang/articles/math/math_module.md b/docs/source/lang/articles/math/math_module.md
similarity index 100%
rename from docs/lang/articles/math/math_module.md
rename to docs/source/lang/articles/math/math_module.md
diff --git a/docs/lang/articles/math/sparse_matrix.md b/docs/source/lang/articles/math/sparse_matrix.md
similarity index 100%
rename from docs/lang/articles/math/sparse_matrix.md
rename to docs/source/lang/articles/math/sparse_matrix.md
diff --git a/docs/lang/articles/performance_tuning/_category_.json b/docs/source/lang/articles/performance_tuning/_category_.json
similarity index 100%
rename from docs/lang/articles/performance_tuning/_category_.json
rename to docs/source/lang/articles/performance_tuning/_category_.json
diff --git a/docs/lang/articles/performance_tuning/performance.md b/docs/source/lang/articles/performance_tuning/performance.md
similarity index 100%
rename from docs/lang/articles/performance_tuning/performance.md
rename to docs/source/lang/articles/performance_tuning/performance.md
diff --git a/docs/lang/articles/performance_tuning/profiler.md b/docs/source/lang/articles/performance_tuning/profiler.md
similarity index 100%
rename from docs/lang/articles/performance_tuning/profiler.md
rename to docs/source/lang/articles/performance_tuning/profiler.md
diff --git a/docs/lang/articles/reference/_category_.json b/docs/source/lang/articles/reference/_category_.json
similarity index 100%
rename from docs/lang/articles/reference/_category_.json
rename to docs/source/lang/articles/reference/_category_.json
diff --git a/docs/lang/articles/reference/differences_between_taichi_and_python_programs.md b/docs/source/lang/articles/reference/differences_between_taichi_and_python_programs.md
similarity index 100%
rename from docs/lang/articles/reference/differences_between_taichi_and_python_programs.md
rename to docs/source/lang/articles/reference/differences_between_taichi_and_python_programs.md
diff --git a/docs/lang/articles/reference/global_settings.md b/docs/source/lang/articles/reference/global_settings.md
similarity index 100%
rename from docs/lang/articles/reference/global_settings.md
rename to docs/source/lang/articles/reference/global_settings.md
diff --git a/docs/lang/articles/reference/language_reference.md b/docs/source/lang/articles/reference/language_reference.md
similarity index 100%
rename from docs/lang/articles/reference/language_reference.md
rename to docs/source/lang/articles/reference/language_reference.md
diff --git a/docs/lang/articles/reference/operator.md b/docs/source/lang/articles/reference/operator.md
similarity index 100%
rename from docs/lang/articles/reference/operator.md
rename to docs/source/lang/articles/reference/operator.md
diff --git a/docs/lang/articles/reference/simt.md b/docs/source/lang/articles/reference/simt.md
similarity index 100%
rename from docs/lang/articles/reference/simt.md
rename to docs/source/lang/articles/reference/simt.md
diff --git a/docs/lang/articles/reference/syntax_sugars.md b/docs/source/lang/articles/reference/syntax_sugars.md
similarity index 100%
rename from docs/lang/articles/reference/syntax_sugars.md
rename to docs/source/lang/articles/reference/syntax_sugars.md
diff --git a/docs/lang/articles/static/assets/aot_tutorial.png b/docs/source/lang/articles/static/assets/aot_tutorial.png
similarity index 100%
rename from docs/lang/articles/static/assets/aot_tutorial.png
rename to docs/source/lang/articles/static/assets/aot_tutorial.png
diff --git a/docs/lang/articles/static/assets/arrow_field.png b/docs/source/lang/articles/static/assets/arrow_field.png
similarity index 100%
rename from docs/lang/articles/static/assets/arrow_field.png
rename to docs/source/lang/articles/static/assets/arrow_field.png
diff --git a/docs/lang/articles/static/assets/arrows.png b/docs/source/lang/articles/static/assets/arrows.png
similarity index 100%
rename from docs/lang/articles/static/assets/arrows.png
rename to docs/source/lang/articles/static/assets/arrows.png
diff --git a/docs/lang/articles/static/assets/bitpacked_fields_layout_example.png b/docs/source/lang/articles/static/assets/bitpacked_fields_layout_example.png
similarity index 100%
rename from docs/lang/articles/static/assets/bitpacked_fields_layout_example.png
rename to docs/source/lang/articles/static/assets/bitpacked_fields_layout_example.png
diff --git a/docs/lang/articles/static/assets/bls_indices_mapping.png b/docs/source/lang/articles/static/assets/bls_indices_mapping.png
similarity index 100%
rename from docs/lang/articles/static/assets/bls_indices_mapping.png
rename to docs/source/lang/articles/static/assets/bls_indices_mapping.png
diff --git a/docs/lang/articles/static/assets/circles.png b/docs/source/lang/articles/static/assets/circles.png
similarity index 100%
rename from docs/lang/articles/static/assets/circles.png
rename to docs/source/lang/articles/static/assets/circles.png
diff --git a/docs/lang/articles/static/assets/colored_circles.png b/docs/source/lang/articles/static/assets/colored_circles.png
similarity index 100%
rename from docs/lang/articles/static/assets/colored_circles.png
rename to docs/source/lang/articles/static/assets/colored_circles.png
diff --git a/docs/lang/articles/static/assets/effect_of_offline_cache.png b/docs/source/lang/articles/static/assets/effect_of_offline_cache.png
similarity index 100%
rename from docs/lang/articles/static/assets/effect_of_offline_cache.png
rename to docs/source/lang/articles/static/assets/effect_of_offline_cache.png
diff --git a/docs/lang/articles/static/assets/floating-point_formats.png b/docs/source/lang/articles/static/assets/floating-point_formats.png
similarity index 100%
rename from docs/lang/articles/static/assets/floating-point_formats.png
rename to docs/source/lang/articles/static/assets/floating-point_formats.png
diff --git a/docs/lang/articles/static/assets/fractal.png b/docs/source/lang/articles/static/assets/fractal.png
similarity index 100%
rename from docs/lang/articles/static/assets/fractal.png
rename to docs/source/lang/articles/static/assets/fractal.png
diff --git a/docs/lang/articles/static/assets/lines.png b/docs/source/lang/articles/static/assets/lines.png
similarity index 100%
rename from docs/lang/articles/static/assets/lines.png
rename to docs/source/lang/articles/static/assets/lines.png
diff --git a/docs/lang/articles/static/assets/point_field.png b/docs/source/lang/articles/static/assets/point_field.png
similarity index 100%
rename from docs/lang/articles/static/assets/point_field.png
rename to docs/source/lang/articles/static/assets/point_field.png
diff --git a/docs/lang/articles/static/assets/quant_array_layout_example.png b/docs/source/lang/articles/static/assets/quant_array_layout_example.png
similarity index 100%
rename from docs/lang/articles/static/assets/quant_array_layout_example.png
rename to docs/source/lang/articles/static/assets/quant_array_layout_example.png
diff --git a/docs/lang/articles/static/assets/rect.png b/docs/source/lang/articles/static/assets/rect.png
similarity index 100%
rename from docs/lang/articles/static/assets/rect.png
rename to docs/source/lang/articles/static/assets/rect.png
diff --git a/docs/lang/articles/static/assets/runtime.png b/docs/source/lang/articles/static/assets/runtime.png
similarity index 100%
rename from docs/lang/articles/static/assets/runtime.png
rename to docs/source/lang/articles/static/assets/runtime.png
diff --git a/docs/lang/articles/static/assets/triangles.png b/docs/source/lang/articles/static/assets/triangles.png
similarity index 100%
rename from docs/lang/articles/static/assets/triangles.png
rename to docs/source/lang/articles/static/assets/triangles.png
diff --git a/docs/lang/articles/type_system/_category_.json b/docs/source/lang/articles/type_system/_category_.json
similarity index 100%
rename from docs/lang/articles/type_system/_category_.json
rename to docs/source/lang/articles/type_system/_category_.json
diff --git a/docs/lang/articles/type_system/type.md b/docs/source/lang/articles/type_system/type.md
similarity index 100%
rename from docs/lang/articles/type_system/type.md
rename to docs/source/lang/articles/type_system/type.md
diff --git a/docs/lang/articles/utilities/_category_.json b/docs/source/lang/articles/utilities/_category_.json
similarity index 100%
rename from docs/lang/articles/utilities/_category_.json
rename to docs/source/lang/articles/utilities/_category_.json
diff --git a/docs/lang/articles/visualization/_category_.json b/docs/source/lang/articles/visualization/_category_.json
similarity index 100%
rename from docs/lang/articles/visualization/_category_.json
rename to docs/source/lang/articles/visualization/_category_.json
diff --git a/docs/lang/articles/visualization/export_results.md b/docs/source/lang/articles/visualization/export_results.md
similarity index 100%
rename from docs/lang/articles/visualization/export_results.md
rename to docs/source/lang/articles/visualization/export_results.md
diff --git a/docs/lang/articles/visualization/ggui.md b/docs/source/lang/articles/visualization/ggui.md
similarity index 100%
rename from docs/lang/articles/visualization/ggui.md
rename to docs/source/lang/articles/visualization/ggui.md
diff --git a/docs/lang/articles/visualization/gui_system.md b/docs/source/lang/articles/visualization/gui_system.md
similarity index 100%
rename from docs/lang/articles/visualization/gui_system.md
rename to docs/source/lang/articles/visualization/gui_system.md
diff --git a/docs/rfcs/20220410-rfc-process.md b/docs/source/rfcs/20220410-rfc-process.md
similarity index 100%
rename from docs/rfcs/20220410-rfc-process.md
rename to docs/source/rfcs/20220410-rfc-process.md
diff --git a/docs/rfcs/20220413-aot-for-all-snode.md b/docs/source/rfcs/20220413-aot-for-all-snode.md
similarity index 100%
rename from docs/rfcs/20220413-aot-for-all-snode.md
rename to docs/source/rfcs/20220413-aot-for-all-snode.md
diff --git a/docs/rfcs/res/.gitkeep b/docs/source/rfcs/res/.gitkeep
similarity index 100%
rename from docs/rfcs/res/.gitkeep
rename to docs/source/rfcs/res/.gitkeep
diff --git a/docs/rfcs/yyyymmdd-rfc-template.md b/docs/source/rfcs/yyyymmdd-rfc-template.md
similarity index 100%
rename from docs/rfcs/yyyymmdd-rfc-template.md
rename to docs/source/rfcs/yyyymmdd-rfc-template.md
diff --git a/docs/variable.json b/docs/source/variable.json
similarity index 100%
rename from docs/variable.json
rename to docs/source/variable.json
diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in
new file mode 100644
index 0000000000000..bf1dadd69b6b0
--- /dev/null
+++ b/docs/sphinx/_toc.yml.in
@@ -0,0 +1,29 @@
+defaults:
+  numbered: False
+  maxdepth: 6
+root: index
+subtrees:
+- entries:
+  - file: what-is-taichi.rst
+    title: What is Taichi Lang?
+
+- caption: Install
+  entries:
+  - file: install/taichi-install.rst
+    title: Install Taichi Lang
+
+- caption: Examples
+  entries:
+  - file: examples/taichi-get-started.rst 
+    title: Get started with Taichi Lang
+  - file: examples/taichi-examples.rst 
+    title: Run a Taichi Lang example
+
+- caption: About
+  entries:
+  - url: https://rocm.blogs.amd.com/artificial-intelligence/taichi_mi300x/README.html
+    title: ROCm Taichi Lang blog
+  - url: https://rocm.docs.amd.com/en/latest/contribute/contributing.html
+    title: Contribute to Taichi Lang
+  - file: about/license.rst
+    title: License
diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in
new file mode 100644
index 0000000000000..63017d30fae98
--- /dev/null
+++ b/docs/sphinx/requirements.in
@@ -0,0 +1 @@
+rocm-docs-core==1.30.1
diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt
new file mode 100644
index 0000000000000..eb5559591ac59
--- /dev/null
+++ b/docs/sphinx/requirements.txt
@@ -0,0 +1,276 @@
+#
+# This file is autogenerated by pip-compile with Python 3.12
+# by the following command:
+#
+#    pip-compile sphinx/requirements.in
+#
+accessible-pygments==0.0.5
+    # via pydata-sphinx-theme
+alabaster==1.0.0
+    # via sphinx
+asttokens==3.0.1
+    # via stack-data
+attrs==25.4.0
+    # via
+    #   jsonschema
+    #   jupyter-cache
+    #   referencing
+babel==2.17.0
+    # via
+    #   pydata-sphinx-theme
+    #   sphinx
+beautifulsoup4==4.14.2
+    # via pydata-sphinx-theme
+breathe==4.36.0
+    # via rocm-docs-core
+certifi==2025.11.12
+    # via requests
+cffi==2.0.0
+    # via
+    #   cryptography
+    #   pynacl
+charset-normalizer==3.4.4
+    # via requests
+click==8.3.1
+    # via
+    #   jupyter-cache
+    #   sphinx-external-toc
+colorama==0.4.6
+    # via
+    #   click
+    #   ipython
+    #   sphinx
+comm==0.2.3
+    # via ipykernel
+cryptography==46.0.3
+    # via pyjwt
+debugpy==1.8.17
+    # via ipykernel
+decorator==5.2.1
+    # via ipython
+docutils==0.21.2
+    # via
+    #   myst-parser
+    #   pydata-sphinx-theme
+    #   sphinx
+executing==2.2.1
+    # via stack-data
+fastjsonschema==2.21.2
+    # via
+    #   nbformat
+    #   rocm-docs-core
+gitdb==4.0.12
+    # via gitpython
+gitpython==3.1.45
+    # via rocm-docs-core
+greenlet==3.2.4
+    # via sqlalchemy
+idna==3.11
+    # via requests
+imagesize==1.4.1
+    # via sphinx
+importlib-metadata==8.7.0
+    # via
+    #   jupyter-cache
+    #   myst-nb
+ipykernel==7.1.0
+    # via myst-nb
+ipython==9.7.0
+    # via
+    #   ipykernel
+    #   myst-nb
+ipython-pygments-lexers==1.1.1
+    # via ipython
+jedi==0.19.2
+    # via ipython
+jinja2==3.1.6
+    # via
+    #   myst-parser
+    #   sphinx
+jsonschema==4.25.1
+    # via nbformat
+jsonschema-specifications==2025.9.1
+    # via jsonschema
+jupyter-cache==1.0.1
+    # via myst-nb
+jupyter-client==8.6.3
+    # via
+    #   ipykernel
+    #   nbclient
+jupyter-core==5.9.1
+    # via
+    #   ipykernel
+    #   jupyter-client
+    #   nbclient
+    #   nbformat
+markdown-it-py==3.0.0
+    # via
+    #   mdit-py-plugins
+    #   myst-parser
+markupsafe==3.0.3
+    # via jinja2
+matplotlib-inline==0.2.1
+    # via
+    #   ipykernel
+    #   ipython
+mdit-py-plugins==0.5.0
+    # via myst-parser
+mdurl==0.1.2
+    # via markdown-it-py
+myst-nb==1.3.0
+    # via rocm-docs-core
+myst-parser==4.0.1
+    # via myst-nb
+nbclient==0.10.2
+    # via
+    #   jupyter-cache
+    #   myst-nb
+nbformat==5.10.4
+    # via
+    #   jupyter-cache
+    #   myst-nb
+    #   nbclient
+nest-asyncio==1.6.0
+    # via ipykernel
+packaging==25.0
+    # via
+    #   ipykernel
+    #   sphinx
+parso==0.8.5
+    # via jedi
+platformdirs==4.5.0
+    # via jupyter-core
+prompt-toolkit==3.0.52
+    # via ipython
+psutil==7.1.3
+    # via ipykernel
+pure-eval==0.2.3
+    # via stack-data
+pycparser==2.23
+    # via cffi
+pydata-sphinx-theme==0.16.1
+    # via
+    #   rocm-docs-core
+    #   sphinx-book-theme
+pygithub==2.8.1
+    # via rocm-docs-core
+pygments==2.19.2
+    # via
+    #   accessible-pygments
+    #   ipython
+    #   ipython-pygments-lexers
+    #   pydata-sphinx-theme
+    #   sphinx
+pyjwt[crypto]==2.10.1
+    # via pygithub
+pynacl==1.6.1
+    # via pygithub
+python-dateutil==2.9.0.post0
+    # via jupyter-client
+pyyaml==6.0.3
+    # via
+    #   jupyter-cache
+    #   myst-nb
+    #   myst-parser
+    #   rocm-docs-core
+    #   sphinx-external-toc
+pyzmq==27.1.0
+    # via
+    #   ipykernel
+    #   jupyter-client
+referencing==0.37.0
+    # via
+    #   jsonschema
+    #   jsonschema-specifications
+requests==2.32.5
+    # via
+    #   pygithub
+    #   sphinx
+rocm-docs-core==1.30.1
+    # via -r sphinx/requirements.in
+roman-numerals-py==3.1.0
+    # via sphinx
+rpds-py==0.29.0
+    # via
+    #   jsonschema
+    #   referencing
+six==1.17.0
+    # via python-dateutil
+smmap==5.0.2
+    # via gitdb
+snowballstemmer==3.0.1
+    # via sphinx
+soupsieve==2.8
+    # via beautifulsoup4
+sphinx==8.2.3
+    # via
+    #   breathe
+    #   myst-nb
+    #   myst-parser
+    #   pydata-sphinx-theme
+    #   rocm-docs-core
+    #   sphinx-book-theme
+    #   sphinx-copybutton
+    #   sphinx-design
+    #   sphinx-external-toc
+    #   sphinx-notfound-page
+    #   sphinx-reredirects
+sphinx-book-theme==1.1.3
+    # via rocm-docs-core
+sphinx-copybutton==0.5.2
+    # via rocm-docs-core
+sphinx-design==0.6.1
+    # via rocm-docs-core
+sphinx-external-toc==1.0.1
+    # via rocm-docs-core
+sphinx-notfound-page==1.1.0
+    # via rocm-docs-core
+sphinx-reredirects==1.0.0
+    # via -r sphinx/requirements.in
+sphinxcontrib-applehelp==2.0.0
+    # via sphinx
+sphinxcontrib-devhelp==2.0.0
+    # via sphinx
+sphinxcontrib-htmlhelp==2.1.0
+    # via sphinx
+sphinxcontrib-jsmath==1.0.1
+    # via sphinx
+sphinxcontrib-qthelp==2.0.0
+    # via sphinx
+sphinxcontrib-serializinghtml==2.0.0
+    # via sphinx
+sqlalchemy==2.0.44
+    # via jupyter-cache
+stack-data==0.6.3
+    # via ipython
+tabulate==0.9.0
+    # via jupyter-cache
+tornado==6.5.2
+    # via
+    #   ipykernel
+    #   jupyter-client
+traitlets==5.14.3
+    # via
+    #   ipykernel
+    #   ipython
+    #   jupyter-client
+    #   jupyter-core
+    #   matplotlib-inline
+    #   nbclient
+    #   nbformat
+typing-extensions==4.15.0
+    # via
+    #   beautifulsoup4
+    #   myst-nb
+    #   pydata-sphinx-theme
+    #   pygithub
+    #   referencing
+    #   sqlalchemy
+urllib3==2.5.0
+    # via
+    #   pygithub
+    #   requests
+wcwidth==0.2.14
+    # via prompt-toolkit
+zipp==3.23.0
+    # via importlib-metadata
diff --git a/docs/what-is-taichi.rst b/docs/what-is-taichi.rst
new file mode 100644
index 0000000000000..7165ffe4d6f2a
--- /dev/null
+++ b/docs/what-is-taichi.rst
@@ -0,0 +1,55 @@
+.. meta::
+  :description: What is Taichi Lang?
+  :keywords: Taichi Lang, python, programming, JIT, AMD, ROCm, overview, introduction
+
+.. _what-is-taichi:
+
+********************************************************************
+What is Taichi Lang?
+********************************************************************
+
+Taichi Lang is an open-source, imperative, and parallel programming language embedded
+in Python, designed for high-performance numerical computation and real-time physical
+simulation. It uses just-in-time (JIT) compilation frameworks such as LLVM to accelerate
+compute-intensive Python code by compiling it into optimized GPU or CPU instructions.
+
+Features and use cases
+====================================================================
+
+Taichi Lang allows developers to write concise, high-level algorithms while leaving
+performance optimization to Taichi’s compiler. Taichi Lang is widely used in domains such
+as fluid dynamics, particle-based simulations, robotics, computer vision, augmented
+reality, artificial intelligence, and visual effects for film and gaming. For example,
+simulating a cloth falling onto a sphere, a system with tens of thousands of mass
+points and springs, can be implemented in Taichi Lang with only a few dozen lines of
+Python code due to its data-oriented design and automatic parallelization.
+
+Taichi Lang also supports advanced techniques such as hierarchical sparse voxel grids for
+large-scale simulations, enabling efficient handling of spatially sparse data structures
+in 3D visual computing. On ROCm, Taichi Lang is officially supported on AMD Instinct
+GPUs, making it a powerful tool for developers who need both flexibility and performance.
+
+
+Why Taichi Lang?
+====================================================================
+
+- **Built around Python**: Taichi Lang shares almost the same syntax with Python, 
+  which lets you write algorithms with minimal syntax differences. It is also well
+  integrated into the Python ecosystem, including NumPy and PyTorch.
+
+- **Flexibility**: Taichi Lang provides a set of generic data containers known 
+  as ``SNode``, an effective mechanism for composing hierarchical, 
+  multi-dimensional fields. This can cover many use patterns in numerical simulation
+  (for example, `spatially sparse computing <https://docs.taichi-lang.org/docs/sparse>`__).
+
+- **Performance**: With the ``ti.kernel`` decorator, Taichi Lang's JIT compiler 
+  automatically compiles your Python functions into efficient GPU or CPU machine
+  code for parallel execution.
+
+- **Portability**: Write your code once and run it everywhere. Taichi 
+  Lang supports most mainstream GPU APIs, such as NVIDIA CUDA and Vulkan. You can
+  write your code with Taichi Lang on ROCm and use it extensibly.
+
+- **Additional feature support**: Cross-platform support that includes a Vulkan-based 3D visualizer,
+  `differentiable programming <https://docs.taichi-lang.org/docs/differentiable_programming>`__,
+  `quantized computation <https://github.com/taichi-dev/quantaichi>`__ (experimental), and many more.
diff --git a/external/SPIRV-Cross b/external/SPIRV-Cross
index c77b09b57c278..131278458ea8e 160000
--- a/external/SPIRV-Cross
+++ b/external/SPIRV-Cross
@@ -1 +1 @@
-Subproject commit c77b09b57c27837dc2d41aa371ed3d236ce9ce47
+Subproject commit 131278458ea8eebe6a6e9c476fbcf71278726e1a
diff --git a/external/SPIRV-Headers b/external/SPIRV-Headers
index 34d04647d384e..b2a156e1c0434 160000
--- a/external/SPIRV-Headers
+++ b/external/SPIRV-Headers
@@ -1 +1 @@
-Subproject commit 34d04647d384e0aed037e7a2662a655fc39841bb
+Subproject commit b2a156e1c0434bc8c99aaebba1c7be98be7ac580
diff --git a/external/SPIRV-Reflect b/external/SPIRV-Reflect
index 7c9c841fa9f40..1aceb6af56e74 160000
--- a/external/SPIRV-Reflect
+++ b/external/SPIRV-Reflect
@@ -1 +1 @@
-Subproject commit 7c9c841fa9f40c09d334d5db6629ba318e46efaf
+Subproject commit 1aceb6af56e74b92a00378842dda5c5a73f49a4b
diff --git a/external/SPIRV-Tools b/external/SPIRV-Tools
index 46ca66e6991f1..5e61ea2098220 160000
--- a/external/SPIRV-Tools
+++ b/external/SPIRV-Tools
@@ -1 +1 @@
-Subproject commit 46ca66e6991f16c89e17ebc9b86995143be2c706
+Subproject commit 5e61ea2098220059e89523f1f47b0bcd8c33b89a
diff --git a/external/Vulkan-Headers b/external/Vulkan-Headers
index 409c16be502e3..76f00ef6cbb18 160000
--- a/external/Vulkan-Headers
+++ b/external/Vulkan-Headers
@@ -1 +1 @@
-Subproject commit 409c16be502e39fe70dd6fe2d9ad4842ef2c9a53
+Subproject commit 76f00ef6cbb1886eb1162d1fa39bee8b51e22ee8
diff --git a/external/VulkanMemoryAllocator b/external/VulkanMemoryAllocator
index 539c0a8d8e373..a88bc520a08d5 160000
--- a/external/VulkanMemoryAllocator
+++ b/external/VulkanMemoryAllocator
@@ -1 +1 @@
-Subproject commit 539c0a8d8e3733c9f25ea9a184c85c77504f1653
+Subproject commit a88bc520a08d574b5b5363e56d6a5b4d106b77bc
diff --git a/external/amdgpu_libdevice/oclc_isa_version_940.bc b/external/amdgpu_libdevice/oclc_isa_version_940.bc
new file mode 100644
index 0000000000000..dd3c79606ca6d
Binary files /dev/null and b/external/amdgpu_libdevice/oclc_isa_version_940.bc differ
diff --git a/external/amdgpu_libdevice/oclc_isa_version_941.bc b/external/amdgpu_libdevice/oclc_isa_version_941.bc
new file mode 100644
index 0000000000000..f9bc8f503dbfa
Binary files /dev/null and b/external/amdgpu_libdevice/oclc_isa_version_941.bc differ
diff --git a/external/amdgpu_libdevice/oclc_isa_version_942.bc b/external/amdgpu_libdevice/oclc_isa_version_942.bc
new file mode 100644
index 0000000000000..8c9446e4ed30e
Binary files /dev/null and b/external/amdgpu_libdevice/oclc_isa_version_942.bc differ
diff --git a/external/amdgpu_libdevice/oclc_isa_version_950.bc b/external/amdgpu_libdevice/oclc_isa_version_950.bc
new file mode 100644
index 0000000000000..499d64a65d991
Binary files /dev/null and b/external/amdgpu_libdevice/oclc_isa_version_950.bc differ
diff --git a/external/assets b/external/assets
index 2905391325512..150b16ad12ad5 160000
--- a/external/assets
+++ b/external/assets
@@ -1 +1 @@
-Subproject commit 2905391325512f58adb3f8684bafc06ef29f8e47
+Subproject commit 150b16ad12ad58a9a93b8988ded913e632a4df4f
diff --git a/external/backward_cpp b/external/backward_cpp
index 51f0700452cf7..647eccde8e87d 160000
--- a/external/backward_cpp
+++ b/external/backward_cpp
@@ -1 +1 @@
-Subproject commit 51f0700452cf71c57d43c2d028277b24cde32502
+Subproject commit 647eccde8e87d7669be1be8c661e26f1a78a3244
diff --git a/external/spdlog b/external/spdlog
index c3aed4b683739..cf6f1dd01e660 160000
--- a/external/spdlog
+++ b/external/spdlog
@@ -1 +1 @@
-Subproject commit c3aed4b68373955e1cc94307683d44dca1515d2b
+Subproject commit cf6f1dd01e660d5865d68bf5fa78f6376b89470a
diff --git a/external/volk b/external/volk
index b87f88292b09b..695b58e71c628 160000
--- a/external/volk
+++ b/external/volk
@@ -1 +1 @@
-Subproject commit b87f88292b09bc899b24028984186581a1d24c4e
+Subproject commit 695b58e71c628d6d50b28d0ab737f110ea696e04
diff --git a/python/taichi/CHANGELOG.md b/python/taichi/CHANGELOG.md
new file mode 100644
index 0000000000000..7b99f10104069
--- /dev/null
+++ b/python/taichi/CHANGELOG.md
@@ -0,0 +1,43 @@
+Highlights:
+   - **Build system**
+      - Revert "Drop OpenGL build (#8751)" (#8753) (by **Proton**)
+      - Drop OpenGL build (#8751) (by **Proton**)
+
+Full changelog:
+   - updating amd gpu kernel code generation to llvm 20 (by **Tiffany Mintz**)
+   - updated AMD Instinct GPU jit implementation to llvm 20 (by **Tiffany Mintz**)
+   - fix build issues with llvm 20 update (by **Tiffany Mintz**)
+   - additional updates for llvm 20 (by **Tiffany Mintz**)
+   - additional cuda updates for llvm20; merging from 8ca16de9a24e82baaed1deac80f71bd541c131ca to add2df35782768f5ad243cacef0922ec00890ba5 from johnnynunez/taichi master branch (by **Tiffany Mintz**)
+   - removing blackwell updates; restoring window_base.cpp include (by **Tiffany Mintz**)
+   - removing updates for blackwell (by **Tiffany Mintz**)
+   - implementing error function and cuda updates; merging 5449f722e796fac22d3b7a041488d77ac5e49b25 to 649c58d7a7991080e5dae510be72f7c6ad528001 from johnnynunez/taichi master branch; some of the changes from these were captured in the previous commit to rocm/taichi (by **Tiffany Mintz**)
+   - cmake build updates, header fixes; Merging from commits ebdc72be75443d8785703e0c8af1236be237fc04 to 9d140238ed1fa1f33b077a5fe74d61905b0167d4 from johnnynunez/taichi master branch (by **Tiffany Mintz**)
+   - Fix header include for program in codegen_cpu.cpp (by **johnnynunez**)
+   - Update header includes and fix LLVM API calls in CPU code generation (by **johnnynunez**)
+   - Refactor JIT compilation in CUDA: update function pointers, enhance PTX handling, and implement new pass manager setup (by **johnnynunez**)
+   - Fix include directive for IR analysis header in codegen_cuda.cpp (by **johnnynunez**)
+   - Add CHANGELOG.md to document recent updates and improvements (by **johnnynunez**)
+   - Update LLVM API calls in codegen_cuda.cpp for compatibility with recent changes (by **johnnynunez**)
+   - LLVM-20 (by **johnnynunez**)
+   - Merge branch 'amd-integration' into amd-develop (by **tmm77**)
+   - Merge pull request #4 from ROCm/master (by **tmm77**)
+   - Merge branch 'amd-develop' into master (by **tmm77**)
+   - Merge pull request #3 from taichi-dev/master (by **tmm77**)
+   - [ir] Sanitize SPIRV debugprintf string (#8752) (by **Proton**)
+   - [ci] Windows needs Vulkan SDK (#8754) (by **Proton**)
+   - [Build] Revert "Drop OpenGL build (#8751)" (#8753) (by **Proton**)
+   - [Build] Drop OpenGL build (#8751) (by **Proton**)
+   - [build] Update minimum requirement for CMAKE (#8703) (by **Linull/李林**)
+   - [build] Add Python 3.13 build (#8748) (by **Proton**)
+   - [build] Identify aarch64 wheels (#8716) (by **Johnny**)
+   - [build] Update setup function to use Miniforge and correct download URLs (#8719) (by **Johnny**)
+   - [build] Update Vulkan SDK version (#8749) (by **T.Yamada**)
+   - [build] Update sccache download URLs to version 0.10.0 for various plat… (#8717) (by **Johnny**)
+   - [build] Replace libtinfo-dev with libncurses-dev in package dependencies (#8718) (by **Johnny**)
+   - [ci] Migrate to new threading interface (#8731) (by **Emmanuel Ferdman**)
+   - ROCm port of taichi (by **Bhanu Kiran Atturu**)
+   - setting architecture to gpu (by **tmm77**)
+   - Taichi Multistage Dockerfile (by **Bhavesh Lad**)
+   - Taicho Multistage Dockerfile (by **Bhavesh Lad**)
+   - fix: Patch to avoid the need to fetch source to build Taichi wheel (by **Bhavesh Lad**)
diff --git a/python/taichi/examples/algorithm/laplace.py b/python/taichi/examples/algorithm/laplace.py
index 6b8bb9fb3c893..20c02713ac50a 100644
--- a/python/taichi/examples/algorithm/laplace.py
+++ b/python/taichi/examples/algorithm/laplace.py
@@ -1,6 +1,6 @@
 import taichi as ti
 
-ti.init(arch=ti.cpu)
+ti.init(arch=ti.gpu)
 
 N = 16
 
diff --git a/python/taichi/lang/ops.py b/python/taichi/lang/ops.py
index e259886d92002..0df416f239870 100644
--- a/python/taichi/lang/ops.py
+++ b/python/taichi/lang/ops.py
@@ -4,6 +4,7 @@
 from typing import Union
 
 import numpy as np
+import math
 from taichi._lib import core as _ti_core
 from taichi.lang import expr, impl
 from taichi.lang.exception import TaichiSyntaxError
@@ -555,6 +556,24 @@ def logical_not(a):
     """
     return _unary_operation(_ti_core.expr_logic_not, np.logical_not, a)
 
+def erf(a):
+    """The error function.
+    Args:
+        a (Union[:class:`~taichi.lang.expr.Expr`, :class:`~taichi.lang.matrix.Matrix`]): A number or a matrix.
+    Returns:
+        The error function of `a`.
+    """
+    return _unary_operation(_ti_core.expr_erf, math.erf, a)
+
+
+def erfc(a):
+    """The complement error function.
+    Args:
+        a (Union[:class:`~taichi.lang.expr.Expr`, :class:`~taichi.lang.matrix.Matrix`]): A number or a matrix.
+    Returns:
+        The complement error function of `a`.
+    """
+    return _unary_operation(_ti_core.expr_erfc, math.erfc, a)
 
 def random(dtype=float) -> Union[float, int]:
     """Return a single random float/integer according to the specified data type.
@@ -1483,4 +1502,6 @@ def min(*args):  # pylint: disable=W0622
     "select",
     "abs",
     "pow",
+    "erf",
+    "erfc",
 ]
diff --git a/python/taichi/math/mathimpl.py b/python/taichi/math/mathimpl.py
index 85fb4565a5629..2ed6ad214ee2b 100644
--- a/python/taichi/math/mathimpl.py
+++ b/python/taichi/math/mathimpl.py
@@ -25,6 +25,8 @@
     sqrt,
     tan,
     tanh,
+    erf,
+    erfc,
 )
 from taichi.types import matrix, template, vector
 from taichi.types.primitive_types import f64, u32, u64
@@ -874,6 +876,8 @@ def clz(x):
     "step",
     "tan",
     "tanh",
+    "erf",
+    "erfc",
     "uvec2",
     "uvec3",
     "uvec4",
diff --git a/readthedocs.yaml b/readthedocs.yaml
new file mode 100644
index 0000000000000..3db285f6f772c
--- /dev/null
+++ b/readthedocs.yaml
@@ -0,0 +1,14 @@
+version: 2
+
+sphinx:
+  configuration: docs/conf.py
+
+python:
+  install:
+    - requirements: docs/sphinx/requirements.txt
+
+build:
+   os: ubuntu-24.04
+   tools:
+      python: "3.12"
+
diff --git a/setup.py b/setup.py
index 45166bbf20d23..f8d09f40e5431 100644
--- a/setup.py
+++ b/setup.py
@@ -9,6 +9,7 @@
 import multiprocessing
 import os
 import platform
+import re
 import shutil
 import subprocess
 import sys
@@ -48,6 +49,8 @@ def get_version():
 project_name = os.getenv("PROJECT_NAME", "taichi")
 version = get_version()
 TI_VERSION_MAJOR, TI_VERSION_MINOR, TI_VERSION_PATCH = version.split(".")
+# Strip non-numeric suffix from patch version (e.g., "0b2" -> "0") for AMD compatibility versioning
+TI_VERSION_PATCH = re.sub(r'[^0-9].*$', '', TI_VERSION_PATCH)
 
 data_files = glob.glob("python/_lib/runtime/*")
 print(data_files)
diff --git a/spdlog_fmt.patch b/spdlog_fmt.patch
new file mode 100644
index 0000000000000..c451134aa2e38
--- /dev/null
+++ b/spdlog_fmt.patch
@@ -0,0 +1,41 @@
+diff --git a/include/spdlog/fmt/bundled/format.h b/include/spdlog/fmt/bundled/format.h
+index 01f41f5..8785c75 100644
+--- a/include/spdlog/fmt/bundled/format.h
++++ b/include/spdlog/fmt/bundled/format.h
+@@ -555,7 +555,7 @@ class u8string_view : public basic_string_view<char8_t> {
+ 
+ #if FMT_USE_USER_DEFINED_LITERALS
+ inline namespace literals {
+-inline u8string_view operator"" _u(const char* s, std::size_t n) {
++inline u8string_view operator""_u(const char* s, std::size_t n) {
+   return {s, n};
+ }
+ }  // namespace literals
+@@ -3467,11 +3467,11 @@ FMT_CONSTEXPR internal::udl_formatter<Char, CHARS...> operator""_format() {
+     std::string message = "The answer is {}"_format(42);
+   \endrst
+  */
+-FMT_CONSTEXPR internal::udl_formatter<char> operator"" _format(const char* s,
++FMT_CONSTEXPR internal::udl_formatter<char> operator""_format(const char* s,
+                                                                std::size_t n) {
+   return {{s, n}};
+ }
+-FMT_CONSTEXPR internal::udl_formatter<wchar_t> operator"" _format(
++FMT_CONSTEXPR internal::udl_formatter<wchar_t> operator""_format(
+     const wchar_t* s, std::size_t n) {
+   return {{s, n}};
+ }
+@@ -3487,11 +3487,11 @@ FMT_CONSTEXPR internal::udl_formatter<wchar_t> operator"" _format(
+     fmt::print("Elapsed time: {s:.2f} seconds", "s"_a=1.23);
+   \endrst
+  */
+-FMT_CONSTEXPR internal::udl_arg<char> operator"" _a(const char* s,
++FMT_CONSTEXPR internal::udl_arg<char> operator""_a(const char* s,
+                                                     std::size_t n) {
+   return {{s, n}};
+ }
+-FMT_CONSTEXPR internal::udl_arg<wchar_t> operator"" _a(const wchar_t* s,
++FMT_CONSTEXPR internal::udl_arg<wchar_t> operator""_a(const wchar_t* s,
+                                                        std::size_t n) {
+   return {{s, n}};
+ }
diff --git a/taichi/codegen/amdgpu/codegen_amdgpu.cpp b/taichi/codegen/amdgpu/codegen_amdgpu.cpp
index 5056334857552..cfecb422ebd5d 100644
--- a/taichi/codegen/amdgpu/codegen_amdgpu.cpp
+++ b/taichi/codegen/amdgpu/codegen_amdgpu.cpp
@@ -110,7 +110,11 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
         builder->CreateBr(bb_merge);
         bb_oeq_then = builder->GetInsertBlock();
 
+#if LLVM_VERSION_MAJOR >= 17
+	bb_oeq_else->insertInto(func);
+#else
         func->getBasicBlockList().push_back(bb_oeq_else);
+#endif
         builder->SetInsertPoint(bb_oeq_else);
         auto fcmp_olt = builder->CreateFCmpOLT(
             input,
@@ -125,7 +129,11 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
         builder->CreateBr(bb_merge);
         bb_olt_then = builder->GetInsertBlock();
 
+#if LLVM_VERSION_MAJOR >= 17
+        bb_olt_else->insertInto(func);
+#else
         func->getBasicBlockList().push_back(bb_olt_else);
+#endif
         builder->SetInsertPoint(bb_olt_else);
         builder->CreateStore(
             llvm::ConstantFP::get(llvm::Type::getFloatTy(*llvm_context), 1),
@@ -133,7 +141,11 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
         builder->CreateBr(bb_merge);
         bb_olt_else = builder->GetInsertBlock();
 
+#if LLVM_VERSION_MAJOR >= 17
+	bb_merge->insertInto(func);
+#else
         func->getBasicBlockList().push_back(bb_merge);
+#endif
         builder->SetInsertPoint(bb_merge);
         llvm_val[stmt] =
             builder->CreateLoad(llvm::Type::getFloatTy(*llvm_context), cast);
@@ -161,7 +173,11 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
         builder->CreateBr(bb_merge);
         bb_oeq_then = builder->GetInsertBlock();
 
+#if LLVM_VERSION_MAJOR >= 17
+	bb_oeq_else->insertInto(func);
+#else
         func->getBasicBlockList().push_back(bb_oeq_else);
+#endif
         builder->SetInsertPoint(bb_oeq_else);
         auto fcmp_olt = builder->CreateFCmpOLT(
             input,
@@ -176,7 +192,11 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
         builder->CreateBr(bb_merge);
         bb_olt_then = builder->GetInsertBlock();
 
+#if LLVM_VERSION_MAJOR >= 17
+	bb_olt_else->insertInto(func);
+#else
         func->getBasicBlockList().push_back(bb_olt_else);
+#endif
         builder->SetInsertPoint(bb_olt_else);
         builder->CreateStore(
             llvm::ConstantFP::get(llvm::Type::getDoubleTy(*llvm_context), 1),
@@ -184,7 +204,11 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
         builder->CreateBr(bb_merge);
         bb_olt_else = builder->GetInsertBlock();
 
+#if LLVM_VERSION_MAJOR >= 17
+	bb_merge->insertInto(func);
+#else
         func->getBasicBlockList().push_back(bb_merge);
+#endif
         builder->SetInsertPoint(bb_merge);
         llvm_val[stmt] =
             builder->CreateLoad(llvm::Type::getDoubleTy(*llvm_context), cast);
diff --git a/taichi/codegen/cpu/codegen_cpu.cpp b/taichi/codegen/cpu/codegen_cpu.cpp
index aa7ac005a4ac4..74e3a0438abc2 100644
--- a/taichi/codegen/cpu/codegen_cpu.cpp
+++ b/taichi/codegen/cpu/codegen_cpu.cpp
@@ -12,12 +12,17 @@
 #include "taichi/ir/analysis.h"
 #include "taichi/analysis/offline_cache_util.h"
 
-#include "llvm/Support/Host.h"
+#if LLVM_VERSION_MAJOR >= 16
+#include <llvm/TargetParser/Host.h>
+#else
+#include <llvm/Support/Host.h>
+#endif
+
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/PassManagerBuilder.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
+#include "llvm/Passes/PassBuilder.h"
 
 namespace taichi::lang {
 
@@ -53,7 +58,7 @@ class TaskCodeGenCPU : public TaskCodeGenLLVM {
     {
       auto guard = get_function_creation_guard(
           {llvm::PointerType::get(get_runtime_type("RuntimeContext"), 0),
-           llvm::Type::getInt8PtrTy(*llvm_context),
+           llvm::PointerType::get(*llvm_context, 0),
            tlctx->get_data_type<int>()});
 
       auto loop_var = create_entry_block_alloca(PrimitiveType::i32);
@@ -81,7 +86,7 @@ class TaskCodeGenCPU : public TaskCodeGenLLVM {
     {
       auto guard = get_function_creation_guard(
           {llvm::PointerType::get(get_runtime_type("RuntimeContext"), 0),
-           llvm::Type::getInt8PtrTy(*llvm_context),
+           llvm::PointerType::get(*llvm_context, 0),
            tlctx->get_data_type<int>()});
 
       for (int i = 0; i < stmt->mesh_prologue->size(); i++) {
@@ -155,7 +160,7 @@ class TaskCodeGenCPU : public TaskCodeGenLLVM {
         "bls_buffer", nullptr, llvm::GlobalVariable::LocalExecTLSModel, 0);
     /* module->getOrInsertGlobal("bls_buffer", type);
     bls_buffer = module->getNamedGlobal("bls_buffer");
-    bls_buffer->setAlignment(llvm::MaybeAlign(8));*/ // TODO(changyu): Fix JIT session error: Symbols not found: [ __emutls_get_address ] in python 3.10
+    bls_buffer->setAlignment(llvm::MaybeAlign(8));*/ //
 
     // initialize the variable with an undef value to ensure it is added to the
     // symbol table
@@ -226,7 +231,6 @@ static llvm::Triple get_host_target_triple() {
   }
   return expected_jtmb->getTargetTriple();
 }
-
 }  // namespace
 
 #ifdef TI_WITH_LLVM
@@ -262,76 +266,85 @@ void KernelCodeGenCPU::optimize_module(llvm::Module *module) {
     options.NoInfsFPMath = 0;
     options.NoNaNsFPMath = 0;
   }
+
   options.HonorSignDependentRoundingFPMathOption = false;
   options.NoZerosInBSS = false;
   options.GuaranteedTailCallOpt = false;
 
-  llvm::legacy::FunctionPassManager function_pass_manager(module);
-  llvm::legacy::PassManager module_pass_manager;
-
+#if LLVM_VERSION_MAJOR >= 18
+  const auto opt_level = llvm::CodeGenOptLevel::Aggressive;
+#else
+  const auto opt_level = llvm::CodeGenOpt::Aggressive;
+#endif
   llvm::StringRef mcpu = llvm::sys::getHostCPUName();
   std::unique_ptr<llvm::TargetMachine> target_machine(
       target->createTargetMachine(triple.str(), mcpu.str(), "", options,
                                   llvm::Reloc::PIC_, llvm::CodeModel::Small,
-                                  llvm::CodeGenOpt::Aggressive));
-
+                                  opt_level));
   TI_ERROR_UNLESS(target_machine.get(), "Could not allocate target machine!");
 
+  module->setTargetTriple(triple.str());
   module->setDataLayout(target_machine->createDataLayout());
 
-  module_pass_manager.add(llvm::createTargetTransformInfoWrapperPass(
-      target_machine->getTargetIRAnalysis()));
-  function_pass_manager.add(llvm::createTargetTransformInfoWrapperPass(
-      target_machine->getTargetIRAnalysis()));
+  // Create the new analysis manager
+  llvm::LoopAnalysisManager LAM;
+  llvm::FunctionAnalysisManager FAM;
+  llvm::CGSCCAnalysisManager CGAM;
+  llvm::ModuleAnalysisManager MAM;
 
-  llvm::PassManagerBuilder b;
-  b.OptLevel = 3;
-  b.Inliner = llvm::createFunctionInliningPass(b.OptLevel, 0, false);
-  b.LoopVectorize = true;
-  b.SLPVectorize = true;
+  // Create the new pass builder
+  llvm::PipelineTuningOptions PTO;
+  PTO.LoopInterleaving = true;
+  PTO.LoopVectorization = true;
+  PTO.SLPVectorization = true;
+  PTO.LoopUnrolling = true;
+  PTO.ForgetAllSCEVInLoopUnroll = true;
 
-  target_machine->adjustPassManager(b);
+  llvm::PassBuilder PB(target_machine.get(), PTO);
 
-  b.populateFunctionPassManager(function_pass_manager);
-  b.populateModulePassManager(module_pass_manager);
+  PB.registerModuleAnalyses(MAM);
+  PB.registerCGSCCAnalyses(CGAM);
+  PB.registerFunctionAnalyses(FAM);
+  PB.registerLoopAnalyses(LAM);
+  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
 
-  {
-    TI_PROFILER("llvm_function_pass");
-    function_pass_manager.doInitialization();
-    for (llvm::Module::iterator i = module->begin(); i != module->end(); i++)
-      function_pass_manager.run(*i);
+  target_machine->registerPassBuilderCallbacks(PB);
 
-    function_pass_manager.doFinalization();
-  }
-
-  /*
-    Optimization for llvm::GetElementPointer:
-    https://github.com/taichi-dev/taichi/issues/5472 The three other passes
-    "loop-reduce", "ind-vars", "cse" serves as preprocessing for
-    "separate-const-offset-gep".
-
-    Note there's an update for "separate-const-offset-gep" in llvm-12.
-  */
-  module_pass_manager.add(llvm::createLoopStrengthReducePass());
-  module_pass_manager.add(llvm::createIndVarSimplifyPass());
-  module_pass_manager.add(llvm::createSeparateConstOffsetFromGEPPass(false));
-  module_pass_manager.add(llvm::createEarlyCSEPass(true));
-
-  llvm::SmallString<8> outstr;
-  llvm::raw_svector_ostream ostream(outstr);
-  ostream.SetUnbuffered();
-  if (compile_config.print_kernel_asm) {
-    // Generate assembly code if neccesary
-    target_machine->addPassesToEmitFile(module_pass_manager, ostream, nullptr,
-                                        llvm::CGFT_AssemblyFile);
-  }
+  llvm::ModulePassManager MPM =
+      PB.buildPerModuleDefaultPipeline(llvm::OptimizationLevel::O3);
 
   {
     TI_PROFILER("llvm_module_pass");
-    module_pass_manager.run(*module);
+    MPM.run(*module, MAM);
+  }
+
+  if (llvm::verifyModule(*module, &llvm::errs())) {
+    module->print(llvm::errs(), nullptr);
+    TI_ERROR("LLVM Module broken");
   }
 
   if (compile_config.print_kernel_asm) {
+    llvm::SmallString<8> outstr;
+    llvm::raw_svector_ostream ostream(outstr);
+    ostream.SetUnbuffered();
+
+    llvm::legacy::PassManager LPM;
+    LPM.add(llvm::createTargetTransformInfoWrapperPass(
+        target_machine->getTargetIRAnalysis()));
+
+    // Override default to generate verbose assembly.
+    target_machine->Options.MCOptions.AsmVerbose = true;
+
+#if LLVM_VERSION_MAJOR >= 18
+    const auto file_type = llvm::CodeGenFileType::AssemblyFile;
+#else
+    const auto file_type = llvm::CGFT_AssemblyFile;
+#endif
+    bool fail =
+        target_machine->addPassesToEmitFile(LPM, ostream, nullptr, file_type);
+    TI_ERROR_IF(fail, "Failed to setup the CPU assembly writer");
+    LPM.run(*module);
+
     static FileSequenceWriter writer(
         "taichi_kernel_cpu_llvm_ir_optimized_asm_{:04d}.s",
         "optimized assembly code (CPU)");
@@ -344,12 +357,12 @@ void KernelCodeGenCPU::optimize_module(llvm::Module *module) {
       TI_INFO("Functions with > 100 instructions in optimized LLVM IR:");
       TaichiLLVMContext::print_huge_functions(module);
     }
+
     static FileSequenceWriter writer(
         "taichi_kernel_cpu_llvm_ir_optimized_{:04d}.ll",
         "optimized LLVM IR (CPU)");
     writer.write(module);
   }
 }
-
 #endif  // TI_WITH_LLVM
-}  // namespace taichi::lang
+}  // namespace taichi::lang
\ No newline at end of file
diff --git a/taichi/codegen/cuda/codegen_cuda.cpp b/taichi/codegen/cuda/codegen_cuda.cpp
index 04112e92bcae6..ce931bbffe74a 100644
--- a/taichi/codegen/cuda/codegen_cuda.cpp
+++ b/taichi/codegen/cuda/codegen_cuda.cpp
@@ -65,6 +65,7 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {
                             const std::vector<llvm::Value *> &values) {
     auto stype = llvm::StructType::get(*llvm_context, types, false);
     auto value_arr = builder->CreateAlloca(stype);
+
     for (int i = 0; i < values.size(); i++) {
       auto value_ptr = builder->CreateGEP(
           stype, value_arr, {tlctx->get_constant(0), tlctx->get_constant(i)});
@@ -74,7 +75,7 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {
         builder.get(), "vprintf",
         builder->CreateGlobalStringPtr(format, "format_string"),
         builder->CreateBitCast(value_arr,
-                               llvm::Type::getInt8PtrTy(*llvm_context)));
+                               llvm::PointerType::get(*llvm_context, 0)));
   }
 
   std::tuple<llvm::Value *, llvm::Type *> create_value_and_type(
@@ -341,6 +342,8 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {
     UNARY_STD(sgn)
     UNARY_STD(acos)
     UNARY_STD(asin)
+    UNARY_STD(erf)
+    UNARY_STD(erfc)
     else {
       TI_P(unary_op_type_name(op));
       TI_NOT_IMPLEMENTED
@@ -605,23 +608,27 @@ class TaskCodeGenCUDA : public TaskCodeGenLLVM {
 
   llvm::Value *create_intrinsic_load(llvm::Value *ptr,
                                      llvm::Type *ty) override {
-    // Issue an "__ldg" instruction to cache data in the read-only data cache.
-    auto intrin = ty->isFloatingPointTy() ? llvm::Intrinsic::nvvm_ldg_global_f
-                                          : llvm::Intrinsic::nvvm_ldg_global_i;
-    // Special treatment for bool types. As nvvm_ldg_global_i does not support
-    // 1-bit integer, so we convert them to i8.
-    if (ty->getScalarSizeInBits() == 1) {
-      auto *new_ty = tlctx->get_data_type<uint8>();
-      auto *new_ptr =
-          builder->CreatePointerCast(ptr, llvm::PointerType::get(new_ty, 0));
-      auto *v = builder->CreateIntrinsic(
-          intrin, {new_ty, llvm::PointerType::get(new_ty, 0)},
-          {new_ptr, tlctx->get_constant(new_ty->getScalarSizeInBits())});
-      return builder->CreateIsNotNull(v);
-    }
-    return builder->CreateIntrinsic(
-        intrin, {ty, llvm::PointerType::get(ty, 0)},
-        {ptr, tlctx->get_constant(ty->getScalarSizeInBits())});
+    // The llvm.nvvm.ldg.global.* intrinsics have been removed.
+    // They are replaced by a standard load from global address space 1
+    // with !invariant.load metadata.
+
+    // The address space for read-only cache loads is 1 (global).
+    llvm::PointerType *ptr_ty_addrspace_1 = llvm::PointerType::get(ty, 1);
+
+    // Cast the input pointer to the correct address space.
+    llvm::Value *cast_ptr =
+        builder->CreateAddrSpaceCast(ptr, ptr_ty_addrspace_1);
+
+    // Create the load instruction.
+    llvm::LoadInst *load = builder->CreateLoad(ty, cast_ptr);
+
+    // Attach the !invariant.load metadata.
+    llvm::MDNode *invariant_load_metadata =
+        llvm::MDNode::get(builder->getContext(), {});
+    load->setMetadata(llvm::LLVMContext::MD_invariant_load,
+                      invariant_load_metadata);
+
+    return load;
   }
 
   void visit(GlobalLoadStmt *stmt) override {
diff --git a/taichi/codegen/dx12/dx12_global_optimize_module.cpp b/taichi/codegen/dx12/dx12_global_optimize_module.cpp
index 2e44c51a965a5..7f8305973ed16 100644
--- a/taichi/codegen/dx12/dx12_global_optimize_module.cpp
+++ b/taichi/codegen/dx12/dx12_global_optimize_module.cpp
@@ -1,4 +1,3 @@
-
 #include "taichi/common/core.h"
 #include "taichi/util/io.h"
 #include "taichi/program/program.h"
@@ -12,25 +11,30 @@
 #include "llvm/ADT/Twine.h"
 #include "llvm/IR/Function.h"
 
+// === CHANGED SECTION: HEADER INCLUDES ===
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Attributes.h"
-#include "llvm/Support/Host.h"
+// #include "llvm/Support/Host.h" // This was not used, but good to be aware of.
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/LegacyPassManager.h"
+// #include "llvm/IR/LegacyPassManager.h" // Obsolete: Removed
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/Transforms/IPO/PassManagerBuilder.h"
-#include "llvm/Transforms/InstCombine/InstCombine.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/GVN.h"
-#include "llvm/Transforms/IPO.h"
+// #include "llvm/Transforms/IPO/PassManagerBuilder.h" // Obsolete: Removed
+// #include "llvm/Transforms/InstCombine/InstCombine.h" // Included via PassBuilder
+// #include "llvm/Transforms/Scalar.h" // Included via PassBuilder
+// #include "llvm/Transforms/Scalar/GVN.h" // Included via PassBuilder
+// #include "llvm/Transforms/IPO.h" // Included via PassBuilder
 #include "llvm/ADT/SmallString.h"
 #include "llvm/IR/GlobalVariable.h"
+// New includes for the New Pass Manager (NPM)
+#include "llvm/Passes/PassBuilder.h"
+// === END OF CHANGED SECTION ===
+
 
 using namespace llvm;
 
@@ -64,6 +68,9 @@ GlobalVariable *createGlobalVariableForResource(Module &M,
   return GV;
 }
 
+// === CHANGED SECTION: ENTIRE FUNCTION REWRITTEN ===
+// The `global_optimize_module` function has been completely rewritten to use the
+// New Pass Manager (NPM) instead of the removed Legacy Pass Manager (LPM).
 std::vector<uint8_t> global_optimize_module(llvm::Module *module,
                                             const CompileConfig &config) {
   TI_AUTO_PROF
@@ -75,8 +82,10 @@ std::vector<uint8_t> global_optimize_module(llvm::Module *module,
   for (llvm::Function &F : module->functions()) {
     if (directx12::is_cs_entry(&F))
       continue;
+    // Mark other functions for inlining.
     F.addFnAttr(llvm::Attribute::AlwaysInline);
   }
+
   // FIXME: choose shader model based on feature used.
   llvm::StringRef triple = "dxil-pc-shadermodel6.0-compute";
   module->setTargetTriple(triple);
@@ -89,72 +98,75 @@ std::vector<uint8_t> global_optimize_module(llvm::Module *module,
   TargetOptions options;
   if (config.fast_math) {
     options.AllowFPOpFusion = FPOpFusion::Fast;
-    options.UnsafeFPMath = 1;
-    options.NoInfsFPMath = 1;
-    options.NoNaNsFPMath = 1;
-  } else {
-    options.AllowFPOpFusion = FPOpFusion::Strict;
-    options.UnsafeFPMath = 0;
-    options.NoInfsFPMath = 0;
-    options.NoNaNsFPMath = 0;
+    options.UnsafeFPMath = true;
+    options.NoInfsFPMath = true;
+    options.NoNaNsFPMath = true;
   }
   options.HonorSignDependentRoundingFPMathOption = false;
   options.NoZerosInBSS = false;
   options.GuaranteedTailCallOpt = false;
 
-  legacy::FunctionPassManager function_pass_manager(module);
-  legacy::PassManager module_pass_manager;
-
   llvm::StringRef mcpu = "";
   std::unique_ptr<TargetMachine> target_machine(target->createTargetMachine(
       triple.str(), mcpu.str(), "", options, llvm::Reloc::PIC_,
       llvm::CodeModel::Small,
       config.opt_level > 0 ? CodeGenOpt::Aggressive : CodeGenOpt::None));
 
-  TI_ERROR_UNLESS(target_machine.get(), "Could not allocate target machine!");
+  TI_ERROR_UNLESS(target_machine, "Could not allocate target machine!");
 
   module->setDataLayout(target_machine->createDataLayout());
 
-  // Lower taichi intrinsic first.
-  module_pass_manager.add(createTaichiIntrinsicLowerPass(&config));
-
-  module_pass_manager.add(createTargetTransformInfoWrapperPass(
-      target_machine->getTargetIRAnalysis()));
-  function_pass_manager.add(createTargetTransformInfoWrapperPass(
-      target_machine->getTargetIRAnalysis()));
-
-  PassManagerBuilder b;
-  b.OptLevel = 3;
-  b.Inliner = createFunctionInliningPass(b.OptLevel, 0, false);
-  b.LoopVectorize = true;
-  b.SLPVectorize = true;
-
-  target_machine->adjustPassManager(b);
+  // === New Pass Manager Setup ===
+  // 1. Create the analysis managers.
+  llvm::LoopAnalysisManager LAM;
+  llvm::FunctionAnalysisManager FAM;
+  llvm::CGSCCAnalysisManager CGAM;
+  llvm::ModuleAnalysisManager MAM;
+
+  // 2. Create the PassBuilder.
+  llvm::PassBuilder PB(target_machine.get());
+
+  // 3. Register all the standard analyses.
+  FAM.registerPass([&] { return target_machine->getTargetIRAnalysis(); });
+  PB.registerModuleAnalyses(MAM);
+  PB.registerCGSCCAnalyses(CGAM);
+  PB.registerFunctionAnalyses(FAM);
+  PB.registerLoopAnalyses(LAM);
+  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
+
+  // 4. Create the main pass manager.
+  llvm::ModulePassManager MPM;
+  
+  // Lower taichi intrinsic first. This is a custom pass.
+  MPM.addPass(createTaichiIntrinsicLowerPass(&config));
+  
+  // 5. Build the default optimization pipeline for O3.
+  llvm::PassBuilder::OptimizationLevel opt_level = llvm::PassBuilder::OptimizationLevel::O3;
+  // This will add inlining, vectorization, etc., replacing `PassManagerBuilder`.
+  // Note: We are now creating a more complex pipeline. We can use `buildPerModuleDefaultPipeline`
+  // but to insert passes in the middle, we construct it manually. A simpler way is to
+  // use `parsePassPipeline`. For now, we build the default pipeline first.
+  if (config.opt_level > 0) {
+      MPM = PB.buildPerModuleDefaultPipeline(opt_level);
+  }
 
-  b.populateFunctionPassManager(function_pass_manager);
-  b.populateModulePassManager(module_pass_manager);
-  // Add passes after inline.
-  module_pass_manager.add(createTaichiRuntimeContextLowerPass());
+  // Add the second custom pass, which should run after inlining.
+  MPM.addPass(createTaichiRuntimeContextLowerPass());
 
-  llvm::SmallString<256> str;
+  llvm::SmallString<0> str;
   llvm::raw_svector_ostream OS(str);
-  // Write DXIL container to OS.
-  target_machine->addPassesToEmitFile(module_pass_manager, OS, nullptr,
-                                      CGFT_ObjectFile);
-
-  {
-    TI_PROFILER("llvm_function_pass");
-    function_pass_manager.doInitialization();
-    for (llvm::Module::iterator i = module->begin(); i != module->end(); i++)
-      function_pass_manager.run(*i);
-
-    function_pass_manager.doFinalization();
+  
+  // 6. Add the pass to emit the object file to the stream.
+  if (auto err = target_machine->addPassesToEmitFile(MPM, OS, nullptr, CGFT_ObjectFile)) {
+    TI_ERROR("Failed to addPassesToEmitFile");
   }
 
+  // 7. Run the entire pipeline.
   {
     TI_PROFILER("llvm_module_pass");
-    module_pass_manager.run(*module);
+    MPM.run(*module, MAM);
   }
+
   if (config.print_kernel_llvm_ir_optimized) {
     static FileSequenceWriter writer(
         "taichi_kernel_dx12_llvm_ir_optimized_{:04d}.ll",
@@ -163,6 +175,5 @@ std::vector<uint8_t> global_optimize_module(llvm::Module *module,
   }
   return std::vector<uint8_t>(str.begin(), str.end());
 }
-
 }  // namespace directx12
-}  // namespace taichi::lang
+}  // namespace taichi::lang
\ No newline at end of file
diff --git a/taichi/codegen/llvm/codegen_llvm.cpp b/taichi/codegen/llvm/codegen_llvm.cpp
index 85e9b1788098e..c0f4d8af21fd0 100644
--- a/taichi/codegen/llvm/codegen_llvm.cpp
+++ b/taichi/codegen/llvm/codegen_llvm.cpp
@@ -1,3 +1,4 @@
+
 #include "taichi/codegen/llvm/codegen_llvm.h"
 
 #include <algorithm>
@@ -199,6 +200,9 @@ void TaskCodeGenLLVM::emit_extra_unary(UnaryOpStmt *stmt) {
   UNARY_STD(asin)
   UNARY_STD(cos)
   UNARY_STD(sin)
+  UNARY_STD(erf)
+  UNARY_STD(erfc)
+
   else if (op == UnaryOpType::sqrt) {
     llvm_val[stmt] =
         builder->CreateIntrinsic(llvm::Intrinsic::sqrt, {input_type}, {input});
@@ -1679,10 +1683,9 @@ llvm::Value *TaskCodeGenLLVM::call(
   auto prefix = get_runtime_snode_name(snode);
   auto s = emit_struct_meta(snode);
   auto s_ptr =
-      builder->CreateBitCast(s, llvm::Type::getInt8PtrTy(*llvm_context));
-
-  node_ptr =
-      builder->CreateBitCast(node_ptr, llvm::Type::getInt8PtrTy(*llvm_context));
+      builder->CreateBitCast(s, llvm::PointerType::get(*llvm_context, 0));
+  node_ptr = builder->CreateBitCast(node_ptr,
+                                    llvm::PointerType::get(*llvm_context, 0));
 
   std::vector<llvm::Value *> func_arguments{s_ptr, node_ptr};
 
@@ -1794,14 +1797,18 @@ void TaskCodeGenLLVM::visit(SNodeLookupStmt *stmt) {
   parent = llvm_val[stmt->input_snode];
   TI_ASSERT(parent);
   auto snode = stmt->snode;
+
   if (snode->type == SNodeType::root) {
     // FIXME: get parent_type from taichi instead of llvm.
     llvm::Type *parent_ty = builder->getInt8Ty();
+
     if (auto bit_cast = llvm::dyn_cast<llvm::BitCastInst>(parent)) {
       parent_ty = bit_cast->getDestTy();
-      if (auto ptr_ty = llvm::dyn_cast<llvm::PointerType>(parent_ty))
-        parent_ty = ptr_ty->getPointerElementType();
+      if (auto ptr_ty = llvm::dyn_cast<llvm::PointerType>(parent_ty)) {
+        TI_NOT_IMPLEMENTED;
+      }
     }
+
     llvm_val[stmt] =
         builder->CreateGEP(parent_ty, parent, llvm_val[stmt->input_index]);
   } else if (snode->type == SNodeType::dense ||
@@ -1842,7 +1849,7 @@ void TaskCodeGenLLVM::visit(GetChStmt *stmt) {
         stmt->output_snode->get_snode_tree_id(),
         stmt->output_snode->get_ch_from_parent_func_name(),
         builder->CreateBitCast(llvm_val[stmt->input_ptr],
-                               llvm::PointerType::getInt8PtrTy(*llvm_context)));
+                               llvm::PointerType::get(*llvm_context, 0)));
     llvm_val[stmt] = builder->CreateBitCast(
         ch, llvm::PointerType::get(StructCompilerLLVM::get_llvm_node_type(
                                        module.get(), stmt->output_snode),
@@ -2436,8 +2443,8 @@ void TaskCodeGenLLVM::visit(AdStackAllocaStmt *stmt) {
   auto type = llvm::ArrayType::get(llvm::Type::getInt8Ty(*llvm_context),
                                    stmt->size_in_bytes());
   auto alloca = create_entry_block_alloca(type, sizeof(int64));
-  llvm_val[stmt] = builder->CreateBitCast(
-      alloca, llvm::PointerType::getInt8PtrTy(*llvm_context));
+  llvm_val[stmt] =
+      builder->CreateBitCast(alloca, llvm::PointerType::get(*llvm_context, 0));
   call("stack_init", llvm_val[stmt]);
 }
 
@@ -2629,7 +2636,7 @@ llvm::Value *TaskCodeGenLLVM::get_tls_base_ptr() {
 }
 
 llvm::Type *TaskCodeGenLLVM::get_tls_buffer_type() {
-  return llvm::Type::getInt8PtrTy(*llvm_context);
+  return llvm::PointerType::get(*llvm_context, 0);
 }
 
 std::vector<llvm::Type *> TaskCodeGenLLVM::get_xlogue_argument_types() {
@@ -2653,6 +2660,7 @@ llvm::Type *TaskCodeGenLLVM::get_mesh_xlogue_function_type() {
 }
 
 llvm::PointerType *TaskCodeGenLLVM::get_integer_ptr_type(int bits) {
+#if 0
   switch (bits) {
     case 8:
       return llvm::Type::getInt8PtrTy(*llvm_context);
@@ -2665,6 +2673,18 @@ llvm::PointerType *TaskCodeGenLLVM::get_integer_ptr_type(int bits) {
     default:
       break;
   }
+#else
+  // opaque pointer
+  switch (bits) {
+    case 8:
+    case 16:
+    case 32:
+    case 64:
+      return llvm::PointerType::get(*llvm_context, 0);
+    default:
+      break;
+  }
+#endif
   TI_ERROR("No compatible " + std::to_string(bits) + " bits integer ptr type.");
   return nullptr;
 }
@@ -2989,4 +3009,4 @@ LLVMCompiledKernel LLVMCompiledKernel::clone() const {
 
 }  // namespace taichi::lang
 
-#endif  // #ifdef TI_WITH_LLVM
+#endif  // #ifdef TI_WITH_LLVM
\ No newline at end of file
diff --git a/taichi/codegen/llvm/codegen_llvm.h b/taichi/codegen/llvm/codegen_llvm.h
index 00d025b61d1e6..da6ba2b1279a2 100644
--- a/taichi/codegen/llvm/codegen_llvm.h
+++ b/taichi/codegen/llvm/codegen_llvm.h
@@ -1,3 +1,4 @@
+
 // The LLVM backend for CPUs/NVPTX/AMDGPU
 #pragma once
 
@@ -438,4 +439,4 @@ class TaskCodeGenLLVM : public IRVisitor, public LLVMModuleBuilder {
 
 }  // namespace taichi::lang
 
-#endif  // #ifdef TI_WITH_LLVM
+#endif  // #ifdef TI_WITH_LLVM
\ No newline at end of file
diff --git a/taichi/codegen/llvm/llvm_codegen_utils.cpp b/taichi/codegen/llvm/llvm_codegen_utils.cpp
index 99a06c403e61f..0f23f14ed0f9c 100644
--- a/taichi/codegen/llvm/llvm_codegen_utils.cpp
+++ b/taichi/codegen/llvm/llvm_codegen_utils.cpp
@@ -25,15 +25,23 @@ bool is_same_type(llvm::Type *a, llvm::Type *b) {
   if (a == b) {
     return true;
   }
+
   if (a->isPointerTy() != b->isPointerTy()) {
     return false;
   }
+
   if (a->isPointerTy()) {
+#if 0
     return is_same_type(a->getPointerElementType(), b->getPointerElementType());
+#else
+    return true;
+#endif
   }
+
   if (a->isFunctionTy() != b->isFunctionTy()) {
     return false;
   }
+
   if (a->isFunctionTy()) {
     auto req_func = llvm::cast<llvm::FunctionType>(a);
     auto prov_func = llvm::cast<llvm::FunctionType>(b);
@@ -144,4 +152,4 @@ void check_func_call_signature(llvm::FunctionType *func_type,
   }
 }
 
-}  // namespace taichi::lang
+}  // namespace taichi::lang
\ No newline at end of file
diff --git a/taichi/codegen/llvm/struct_llvm.cpp b/taichi/codegen/llvm/struct_llvm.cpp
index 849b1f0b3bb3d..5e10b6b3116de 100644
--- a/taichi/codegen/llvm/struct_llvm.cpp
+++ b/taichi/codegen/llvm/struct_llvm.cpp
@@ -8,7 +8,6 @@
 #include "taichi/util/file_sequence_writer.h"
 
 namespace taichi::lang {
-
 StructCompilerLLVM::StructCompilerLLVM(Arch arch,
                                        const CompileConfig &config,
                                        TaichiLLVMContext *tlctx,
@@ -104,14 +103,14 @@ void StructCompilerLLVM::generate_types(SNode &snode) {
     // mutex
     aux_type = llvm::ArrayType::get(llvm::PointerType::getInt64Ty(*ctx),
                                     snode.max_num_elements());
-    body_type = llvm::ArrayType::get(llvm::PointerType::getInt8PtrTy(*ctx),
+    body_type = llvm::ArrayType::get(llvm::PointerType::get(*ctx, 0),
                                      snode.max_num_elements());
   } else if (type == SNodeType::dynamic) {
     // mutex and n (number of elements)
     aux_type =
         llvm::StructType::get(*ctx, {llvm::PointerType::getInt32Ty(*ctx),
                                      llvm::PointerType::getInt32Ty(*ctx)});
-    body_type = llvm::PointerType::getInt8PtrTy(*ctx);
+    body_type = llvm::PointerType::get(*ctx, 0);
   } else {
     TI_P(snode.type_name());
     TI_NOT_IMPLEMENTED;
@@ -206,10 +205,9 @@ void StructCompilerLLVM::generate_child_accessors(SNode &snode) {
 
     auto inp_type =
         llvm::PointerType::get(get_llvm_element_type(module.get(), parent), 0);
-
     auto ft =
-        llvm::FunctionType::get(llvm::Type::getInt8PtrTy(*llvm_ctx_),
-                                {llvm::Type::getInt8PtrTy(*llvm_ctx_)}, false);
+        llvm::FunctionType::get(llvm::PointerType::get(*llvm_ctx_, 0),
+                                {llvm::PointerType::get(*llvm_ctx_, 0)}, false);
 
     auto func = create_function(ft, snode.get_ch_from_parent_func_name());
 
@@ -221,15 +219,14 @@ void StructCompilerLLVM::generate_child_accessors(SNode &snode) {
     for (auto &arg : func->args()) {
       args.push_back(&arg);
     }
+
     llvm::Value *ret;
     ret = builder.CreateGEP(get_llvm_element_type(module.get(), parent),
                             builder.CreateBitCast(args[0], inp_type),
                             {tlctx_->get_constant(0),
                              tlctx_->get_constant(parent->child_id(&snode))},
                             "getch");
-
-    builder.CreateRet(
-        builder.CreateBitCast(ret, llvm::Type::getInt8PtrTy(*llvm_ctx_)));
+    builder.CreateRet(ret);
   }
 
   for (auto &ch : snode.ch) {
@@ -311,5 +308,4 @@ llvm::Function *StructCompilerLLVM::create_function(llvm::FunctionType *ft,
   return llvm::Function::Create(ft, llvm::Function::ExternalLinkage, func_name,
                                 *module);
 }
-
-}  // namespace taichi::lang
+}  // namespace taichi::lang
\ No newline at end of file
diff --git a/taichi/common/core.h b/taichi/common/core.h
index f112f8f35baad..e704943069672 100644
--- a/taichi/common/core.h
+++ b/taichi/common/core.h
@@ -167,38 +167,38 @@ using real = float32;
 
 // Float literal for both float32/64
 // (Learned from https://github.com/hi2p-perim/lightmetrica-v2)
-real constexpr operator"" _f(long double v) {
+real constexpr operator""_f(long double v) {
   return real(v);
 }
-real constexpr operator"" _f(unsigned long long v) {
+real constexpr operator""_f(unsigned long long v) {
   return real(v);
 }
 
-float32 constexpr operator"" _f32(long double v) {
+float32 constexpr operator""_f32(long double v) {
   return float32(v);
 }
-float32 constexpr operator"" _f32(unsigned long long v) {
+float32 constexpr operator""_f32(unsigned long long v) {
   return float32(v);
 }
 
-float32 constexpr operator"" _fs(long double v) {
+float32 constexpr operator""_fs(long double v) {
   return float32(v);
 }
-float32 constexpr operator"" _fs(unsigned long long v) {
+float32 constexpr operator""_fs(unsigned long long v) {
   return float32(v);
 }
 
-float64 constexpr operator"" _f64(long double v) {
+float64 constexpr operator""_f64(long double v) {
   return float64(v);
 }
-float64 constexpr operator"" _f64(unsigned long long v) {
+float64 constexpr operator""_f64(unsigned long long v) {
   return float64(v);
 }
 
-float64 constexpr operator"" _fd(long double v) {
+float64 constexpr operator""_fd(long double v) {
   return float64(v);
 }
-float64 constexpr operator"" _fd(unsigned long long v) {
+float64 constexpr operator""_fd(unsigned long long v) {
   return float64(v);
 }
 
diff --git a/taichi/common/types.h b/taichi/common/types.h
index b4728a6daa22d..89dbc7c2e2be9 100644
--- a/taichi/common/types.h
+++ b/taichi/common/types.h
@@ -38,14 +38,14 @@ using real = float32;
 
 // Float literal for both float32/64
 // (Learned from https://github.com/hi2p-perim/lightmetrica-v2)
-real constexpr operator"" _f(long double v) {
+real constexpr operator""_f(long double v) {
   return real(v);
 }
 real constexpr operator"" _f(unsigned long long v) {
   return real(v);
 }
 
-float32 constexpr operator"" _f32(long double v) {
+float32 constexpr operator""_f32(long double v) {
   return float32(v);
 }
 float32 constexpr operator"" _f32(unsigned long long v) {
diff --git a/taichi/inc/unary_op.inc.h b/taichi/inc/unary_op.inc.h
index b7fa8abc109b9..d3102c50faf2d 100644
--- a/taichi/inc/unary_op.inc.h
+++ b/taichi/inc/unary_op.inc.h
@@ -24,3 +24,5 @@ PER_UNARY_OP(rsqrt)
 PER_UNARY_OP(bit_not)
 PER_UNARY_OP(logic_not)
 PER_UNARY_OP(undefined)
+PER_UNARY_OP(erf)
+PER_UNARY_OP(erfc)
\ No newline at end of file
diff --git a/taichi/ir/expression_ops.h b/taichi/ir/expression_ops.h
index 120b423cdda26..17504685a8d07 100644
--- a/taichi/ir/expression_ops.h
+++ b/taichi/ir/expression_ops.h
@@ -95,6 +95,8 @@ DEFINE_EXPRESSION_FUNC_UNARY(log)
 DEFINE_EXPRESSION_FUNC_UNARY(popcnt)
 DEFINE_EXPRESSION_FUNC_UNARY(clz)
 DEFINE_EXPRESSION_FUNC_UNARY(logic_not)
+DEFINE_EXPRESSION_FUNC_UNARY(erf)
+DEFINE_EXPRESSION_FUNC_UNARY(erfc)
 DEFINE_EXPRESSION_OP_UNARY(~, bit_not)
 DEFINE_EXPRESSION_OP_UNARY(-, neg)
 
diff --git a/taichi/ir/ir_builder.cpp b/taichi/ir/ir_builder.cpp
index b28412cd441f4..a7ab59a9571be 100644
--- a/taichi/ir/ir_builder.cpp
+++ b/taichi/ir/ir_builder.cpp
@@ -1,3 +1,4 @@
+
 #include "taichi/ir/ir_builder.h"
 #include "taichi/ir/statements.h"
 #include "taichi/common/logging.h"
@@ -129,6 +130,12 @@ ContinueStmt *IRBuilder::create_continue() {
   return insert(Stmt::make_typed<ContinueStmt>());
 }
 
+void IRBuilder::create_assert(Stmt *cond, const std::string &msg) {
+  std::vector<Stmt *> empty_args;
+  auto assert_stmt = Stmt::make_typed<AssertStmt>(cond, msg, empty_args);
+  insert(std::move(assert_stmt));
+}
+
 FuncCallStmt *IRBuilder::create_func_call(Function *func,
                                           const std::vector<Stmt *> &args) {
   return insert(Stmt::make_typed<FuncCallStmt>(func, args));
@@ -138,6 +145,12 @@ LoopIndexStmt *IRBuilder::get_loop_index(Stmt *loop, int index) {
   return insert(Stmt::make_typed<LoopIndexStmt>(loop, index));
 }
 
+ConstStmt *IRBuilder::get_bool(bool value) {
+  return insert(Stmt::make_typed<ConstStmt>(TypedConstant(
+      TypeFactory::get_instance().get_primitive_type(PrimitiveTypeID::u1),
+      value)));
+}
+
 ConstStmt *IRBuilder::get_int32(int32 value) {
   return insert(Stmt::make_typed<ConstStmt>(TypedConstant(
       TypeFactory::get_instance().get_primitive_type(PrimitiveTypeID::i32),
@@ -181,9 +194,10 @@ RandStmt *IRBuilder::create_rand(DataType value_type) {
 ArgLoadStmt *IRBuilder::create_arg_load(const std::vector<int> &arg_id,
                                         DataType dt,
                                         bool is_ptr,
-                                        int arg_depth) {
-  return insert(Stmt::make_typed<ArgLoadStmt>(arg_id, dt, is_ptr,
-                                              /*create_load*/ true, arg_depth));
+                                        int arg_depth,
+                                        bool create_load) {
+  return insert(Stmt::make_typed<ArgLoadStmt>(arg_id, dt, is_ptr, create_load,
+                                              arg_depth));
 }
 
 ReturnStmt *IRBuilder::create_return(Stmt *value) {
@@ -282,6 +296,14 @@ UnaryOpStmt *IRBuilder::create_clz(Stmt *value) {
   return insert(Stmt::make_typed<UnaryOpStmt>(UnaryOpType::clz, value));
 }
 
+UnaryOpStmt *IRBuilder::create_erf(Stmt *value) {
+  return insert(Stmt::make_typed<UnaryOpStmt>(UnaryOpType::erf, value));
+}
+
+UnaryOpStmt *IRBuilder::create_erfc(Stmt *value) {
+  return insert(Stmt::make_typed<UnaryOpStmt>(UnaryOpType::erfc, value));
+}
+
 BinaryOpStmt *IRBuilder::create_add(Stmt *l, Stmt *r) {
   return insert(Stmt::make_typed<BinaryOpStmt>(BinaryOpType::add, l, r));
 }
diff --git a/taichi/ir/ir_builder.h b/taichi/ir/ir_builder.h
index c585ed7be425e..0bd88e064aec2 100644
--- a/taichi/ir/ir_builder.h
+++ b/taichi/ir/ir_builder.h
@@ -1,3 +1,4 @@
+
 #pragma once
 
 #include "taichi/ir/ir.h"
@@ -121,6 +122,7 @@ class IRBuilder {
   IfStmt *create_if(Stmt *cond);
   WhileControlStmt *create_break();
   ContinueStmt *create_continue();
+  void create_assert(Stmt *cond, const std::string &msg);
 
   // Function.
   FuncCallStmt *create_func_call(Function *func,
@@ -130,6 +132,7 @@ class IRBuilder {
   LoopIndexStmt *get_loop_index(Stmt *loop, int index = 0);
 
   // Constants. TODO: add more types
+  ConstStmt *get_bool(bool value);
   ConstStmt *get_int32(int32 value);
   ConstStmt *get_int64(int64 value);
   ConstStmt *get_uint32(uint32 value);
@@ -148,7 +151,8 @@ class IRBuilder {
   ArgLoadStmt *create_arg_load(const std::vector<int> &arg_id,
                                DataType dt,
                                bool is_ptr,
-                               int arg_depth);
+                               int arg_depth,
+                               bool create_load = true);
   // Load kernel arguments.
   ArgLoadStmt *create_ndarray_arg_load(const std::vector<int> &arg_id,
                                        DataType dt,
@@ -181,6 +185,8 @@ class IRBuilder {
   UnaryOpStmt *create_log(Stmt *value);
   UnaryOpStmt *create_popcnt(Stmt *value);
   UnaryOpStmt *create_clz(Stmt *value);
+  UnaryOpStmt *create_erf(Stmt *value);
+  UnaryOpStmt *create_erfc(Stmt *value);
 
   // Binary operations. Returns the result.
   BinaryOpStmt *create_add(Stmt *l, Stmt *r);
diff --git a/taichi/math/linalg.h b/taichi/math/linalg.h
index 4d31bb5e7fb10..628572ac4976c 100644
--- a/taichi/math/linalg.h
+++ b/taichi/math/linalg.h
@@ -242,7 +242,9 @@ struct VectorND : public VectorNDBase<dim__, T, ISE> {
   }
 
   TI_FORCE_INLINE VectorND &operator=(const VectorND &o) {
-    memcpy(this, &o, sizeof(*this));
+    for (int i = 0; i < dim; i++) {
+      this->d[i] = o.d[i];
+    }
     return *this;
   }
 
diff --git a/taichi/runtime/amdgpu/jit_amdgpu.cpp b/taichi/runtime/amdgpu/jit_amdgpu.cpp
index 87903f0a35404..b7a8c349886e2 100644
--- a/taichi/runtime/amdgpu/jit_amdgpu.cpp
+++ b/taichi/runtime/amdgpu/jit_amdgpu.cpp
@@ -64,12 +64,17 @@ std::string JITSessionAMDGPU::compile_module_to_hsaco(
   options.NoZerosInBSS = 0;
   options.GuaranteedTailCallOpt = 0;
 
+#if LLVM_VERSION_MAJOR >= 18
+  const auto opt_level = llvm::CodeGenOptLevel::Aggressive;
+#else
+  const auto opt_level = llvm::CodeGenOpt::Aggressive;
+#endif
+
   std::unique_ptr<llvm::TargetMachine> machine(target->createTargetMachine(
       triple_str, AMDGPUContext::get_instance().get_mcpu(), "", options,
-      llvm::Reloc::PIC_, llvm::CodeModel::Small, llvm::CodeGenOpt::Aggressive));
+      llvm::Reloc::PIC_, llvm::CodeModel::Small, opt_level));
 
   llvm_module->setDataLayout(machine->createDataLayout());
-
   if (this->config_.print_kernel_amdgcn) {
     // Amdgcn will not generated during generating hsaco file
     // It's an interim impl
@@ -84,11 +89,47 @@ std::string JITSessionAMDGPU::compile_module_to_hsaco(
     // another way
     //    llvm-objdump -d xxxx.hsaco(can ensure that hsaco and gcn correspond to
     //    each other)
-
     auto module_clone = llvm::CloneModule(*llvm_module);
-    llvm::legacy::PassManager module_gen_gcn_pass_manager;
     llvm::SmallString<0> gcnstr;
     llvm::raw_svector_ostream llvm_stream_gcn(gcnstr);
+#if LLVM_VERSION_MAJOR >= 17
+   // Create the new analysis manager
+    llvm::LoopAnalysisManager LAM;
+    llvm::FunctionAnalysisManager FAM;
+    llvm::CGSCCAnalysisManager CGAM;
+    llvm::ModuleAnalysisManager MAM;
+
+  // Create the new pass builder
+    llvm::PipelineTuningOptions PTO;
+    PTO.LoopInterleaving = false;
+    PTO.LoopVectorization = false;
+    PTO.SLPVectorization = true;
+    PTO.LoopUnrolling = false;
+    PTO.ForgetAllSCEVInLoopUnroll = true;
+
+    std::unique_ptr<llvm::TargetMachine> machine_gen_gcn(
+        target->createTargetMachine(
+            triple_str, AMDGPUContext::get_instance().get_mcpu(), "", options,
+            llvm::Reloc::PIC_, llvm::CodeModel::Small,
+            opt_level));
+
+    llvm::PassBuilder module_gen_gcn_pass_manager(machine_gen_gcn.get(), PTO);
+
+    module_gen_gcn_pass_manager.registerModuleAnalyses(MAM);
+    module_gen_gcn_pass_manager.registerCGSCCAnalyses(CGAM);
+    module_gen_gcn_pass_manager.registerFunctionAnalyses(FAM);
+    module_gen_gcn_pass_manager.registerLoopAnalyses(LAM);
+    module_gen_gcn_pass_manager.crossRegisterProxies(LAM, FAM, CGAM, MAM);
+
+    llvm::ModulePassManager builder =
+      module_gen_gcn_pass_manager.buildPerModuleDefaultPipeline(llvm::OptimizationLevel::O3);
+
+    machine_gen_gcn->registerPassBuilderCallbacks(module_gen_gcn_pass_manager);
+
+    builder.run(*module_clone, MAM);
+#else
+
+    llvm::legacy::PassManager module_gen_gcn_pass_manager;
     std::unique_ptr<llvm::TargetMachine> machine_gen_gcn(
         target->createTargetMachine(
             triple_str, AMDGPUContext::get_instance().get_mcpu(), "", options,
@@ -105,11 +146,43 @@ std::string JITSessionAMDGPU::compile_module_to_hsaco(
                                          llvm_stream_gcn, nullptr,
                                          llvm::CGFT_AssemblyFile, true);
     module_gen_gcn_pass_manager.run(*module_clone);
+#endif
     std::string gcn(gcnstr.begin(), gcnstr.end());
     static FileSequenceWriter writer("taichi_kernel_amdgcn_{:04d}.gcn",
                                      "module AMDGCN");
     writer.write(gcn);
   }
+#if LLVM_VERSION_MAJOR >= 17
+
+   // Create the new analysis manager
+    llvm::LoopAnalysisManager LAM;
+    llvm::FunctionAnalysisManager FAM;
+    llvm::CGSCCAnalysisManager CGAM;
+    llvm::ModuleAnalysisManager MAM;
+
+  // Create the new pass builder
+    llvm::PipelineTuningOptions PTO;
+    PTO.LoopInterleaving = false;
+    PTO.LoopVectorization = false;
+    PTO.SLPVectorization = true;
+    PTO.LoopUnrolling = false;
+    PTO.ForgetAllSCEVInLoopUnroll = true;
+    
+    llvm::PassBuilder module_pass_manager(machine.get(), PTO);
+
+    module_pass_manager.registerModuleAnalyses(MAM);
+    module_pass_manager.registerCGSCCAnalyses(CGAM);
+    module_pass_manager.registerFunctionAnalyses(FAM);
+    module_pass_manager.registerLoopAnalyses(LAM);
+    module_pass_manager.crossRegisterProxies(LAM, FAM, CGAM, MAM);
+
+    llvm::ModulePassManager builder =
+      module_pass_manager.buildPerModuleDefaultPipeline(llvm::OptimizationLevel::O3);
+
+    machine->registerPassBuilderCallbacks(module_pass_manager);
+
+    builder.run(*llvm_module, MAM);
+#else
 
   llvm::legacy::FunctionPassManager function_pass_manager(llvm_module.get());
   llvm::legacy::PassManager module_pass_manager;
@@ -126,6 +199,7 @@ std::string JITSessionAMDGPU::compile_module_to_hsaco(
   machine->adjustPassManager(builder);
   builder.populateFunctionPassManager(function_pass_manager);
   builder.populateModulePassManager(module_pass_manager);
+#endif
 
   machine->Options.MCOptions.AsmVerbose = true;
 
@@ -142,6 +216,14 @@ std::string JITSessionAMDGPU::compile_module_to_hsaco(
   llvm::SmallString<0> outstr;
   llvm::raw_svector_ostream llvm_stream(outstr);
 
+#if LLVM_VERSION_MAJOR >= 17
+  llvm::legacy::PassManager legacy_pass_manager;
+  legacy_pass_manager.add(llvm::createTargetTransformInfoWrapperPass(
+      machine->getTargetIRAnalysis()));
+  machine->addPassesToEmitFile(legacy_pass_manager, llvm_stream, nullptr,
+                                llvm::CodeGenFileType::ObjectFile, true);
+  legacy_pass_manager.run(*llvm_module);
+#else
   machine->addPassesToEmitFile(module_pass_manager, llvm_stream, nullptr,
                                llvm::CGFT_ObjectFile, true);
 
@@ -150,6 +232,7 @@ std::string JITSessionAMDGPU::compile_module_to_hsaco(
     function_pass_manager.run(*func);
   function_pass_manager.doFinalization();
   module_pass_manager.run(*llvm_module);
+#endif
 
   std::string obj_str(outstr.begin(), outstr.end());
   std::ofstream(obj_path) << obj_str;
diff --git a/taichi/runtime/amdgpu/jit_amdgpu.h b/taichi/runtime/amdgpu/jit_amdgpu.h
index 90b051833ddb7..5d798663a6b37 100644
--- a/taichi/runtime/amdgpu/jit_amdgpu.h
+++ b/taichi/runtime/amdgpu/jit_amdgpu.h
@@ -1,3 +1,5 @@
+#pragma once // It's good practice to have include guards
+
 #include <memory>
 #include <utility>
 #include <mutex>
@@ -11,16 +13,22 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Verifier.h"
-#include "llvm/Transforms/InstCombine/InstCombine.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/GVN.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#if LLVM_VERSION_MAJOR >= 17
+#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Passes/OptimizationLevel.h"
+#include "llvm/Transforms/IPO/Inliner.h"
+#include "llvm/IR/Module.h"
+#else
+#include "llvm/IR/LegacyPassManager.h"            // Obsolete: Removed
+#include "llvm/Transforms/InstCombine/InstCombine.h"// Obsolete: Handled by NPM PassBuilder
+#include "llvm/Transforms/Scalar.h"                 // Obsolete: Handled by NPM PassBuilder
+#include "llvm/Transforms/Scalar/GVN.h"             // Obsolete: Handled by NPM PassBuilder
+#include "llvm/Transforms/IPO.h"                    // Obsolete: Handled by NPM PassBuilder
+#include "llvm/Transforms/IPO/PassManagerBuilder.h" // Obsolete: Removed
+#endif
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/MC/TargetRegistry.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
 
 #include "taichi/rhi/amdgpu/amdgpu_context.h"
@@ -124,7 +132,7 @@ class JITSessionAMDGPU : public JITSession {
                        (std::istreambuf_iterator<char>()));
   }
 
-  uint64 get_random_num() {
+  uint64_t get_random_num() {
     // Note: ROCm is available only on Linux OS.
     static std::random_device device("/dev/urandom");
     static std::mt19937_64 *rng = new std::mt19937_64(device());
diff --git a/taichi/runtime/cpu/jit_cpu.cpp b/taichi/runtime/cpu/jit_cpu.cpp
index 59cf0379b56ff..f7fe013f2c773 100644
--- a/taichi/runtime/cpu/jit_cpu.cpp
+++ b/taichi/runtime/cpu/jit_cpu.cpp
@@ -3,7 +3,7 @@
 #include <memory>
 
 #ifdef TI_WITH_LLVM
-#include "llvm/Analysis/TargetTransformInfo.h"
+// #include "llvm/Analysis/TargetTransformInfo.h" // Not used here, but kept for consistency
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ExecutionEngine/ExecutionEngine.h"
 #include "llvm/ExecutionEngine/JITSymbol.h"
@@ -27,19 +27,25 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/Support/DynamicLibrary.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/Transforms/IPO/PassManagerBuilder.h"
-#include "llvm/Transforms/InstCombine/InstCombine.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/GVN.h"
-#include "llvm/Transforms/IPO.h"
+
+// === CHANGED SECTION: HEADER INCLUDES ===
+// The following headers were for the Legacy Pass Manager and are now removed.
+// Since this file does not actually run any optimization passes, no replacement
+// code is needed.
+// #include "llvm/IR/LegacyPassManager.h"
+// #include "llvm/Transforms/IPO/PassManagerBuilder.h"
+// #include "llvm/Transforms/InstCombine/InstCombine.h"
+// #include "llvm/Transforms/Scalar.h"
+// #include "llvm/Transforms/Scalar/GVN.h"
+// #include "llvm/Transforms/IPO.h"
+// === END OF CHANGED SECTION ===
 
 #include "llvm/MC/TargetRegistry.h"
-#include "llvm/Support/Host.h"
+#include "llvm/TargetParser/Host.h" // Corrected include path for llvm::sys::*
 
 #endif
 
@@ -62,6 +68,10 @@ typedef orc::RTDyldObjectLinkingLayer ObjLayerT;
 #endif
 #endif
 
+// A small correction: get_host_target_info was in the original file, but it
+// was not marked as static or put in an anonymous namespace, which can cause
+// linker errors. Let's fix that while we are here.
+namespace {
 std::pair<JITTargetMachineBuilder, llvm::DataLayout> get_host_target_info() {
   auto expected_jtmb = JITTargetMachineBuilder::detectHost();
   if (!expected_jtmb)
@@ -74,6 +84,7 @@ std::pair<JITTargetMachineBuilder, llvm::DataLayout> get_host_target_info() {
   auto data_layout = *expected_data_layout;
   return std::make_pair(jtmb, data_layout);
 }
+} // anonymous namespace
 
 class JITSessionCPU;
 
@@ -181,7 +192,7 @@ class JITSessionCPU : public JITSession {
 #endif
     if (!symbol)
       TI_ERROR("Function \"{}\" not found", Name);
-    return (void *)(symbol->getAddress());
+    return symbol->toPtr<void*>();
   }
 
   void *lookup_in_module(JITDylib *lib, const std::string Name) {
@@ -193,7 +204,7 @@ class JITSessionCPU : public JITSession {
 #endif
     if (!symbol)
       TI_ERROR("Function \"{}\" not found", Name);
-    return (void *)(symbol->getAddress());
+    return symbol->toPtr<void*>();
   }
 };
 
@@ -213,4 +224,4 @@ std::unique_ptr<JITSession> create_llvm_jit_session_cpu(
                                          target_info.first, target_info.second);
 }
 
-}  // namespace taichi::lang
+}  // namespace taichi::lang
\ No newline at end of file
diff --git a/taichi/runtime/cuda/jit_cuda.cpp b/taichi/runtime/cuda/jit_cuda.cpp
index 6f9d802294ffa..46860c562678c 100644
--- a/taichi/runtime/cuda/jit_cuda.cpp
+++ b/taichi/runtime/cuda/jit_cuda.cpp
@@ -1,6 +1,9 @@
+
 #include "taichi/runtime/cuda/jit_cuda.h"
 #include "taichi/runtime/llvm/llvm_context.h"
 
+#include "llvm/Passes/PassBuilder.h"
+
 namespace taichi::lang {
 
 #if defined(TI_WITH_CUDA)
@@ -13,9 +16,11 @@ JITModule *JITSessionCUDA ::add_module(std::unique_ptr<llvm::Module> M,
                                      "module NVPTX");
     writer.write(ptx);
   }
+
   // TODO: figure out why using the guard leads to wrong tests results
   // auto context_guard = CUDAContext::get_instance().get_guard();
   CUDAContext::get_instance().make_current();
+
   // Create module for object
   void *cuda_module;
   TI_TRACE("PTX size: {:.2f}KB", ptx.size() / 1024.0);
@@ -40,12 +45,13 @@ JITModule *JITSessionCUDA ::add_module(std::unique_ptr<llvm::Module> M,
   CUDADriver::get_instance().module_load_data_ex(
       &cuda_module, ptx.c_str(), num_options, options, option_values);
   TI_TRACE("CUDA module load time : {}ms", (Time::get_time() - t) * 1000);
-  // cudaModules.push_back(cudaModule);
+
   modules.push_back(std::make_unique<JITModuleCUDA>(cuda_module));
   return modules.back().get();
 }
 
 std::string cuda_mattrs() {
+  // TODO: upgrade to ptx78 as supported by LLVM 16
   return "+ptx63";
 }
 
@@ -98,7 +104,6 @@ std::string JITSessionCUDA::compile_module_to_ptx(
   llvm::Triple triple(module->getTargetTriple());
 
   // Allocate target machine
-
   std::string err_str;
   const llvm::Target *target =
       TargetRegistry::lookupTarget(triple.str(), err_str);
@@ -119,32 +124,25 @@ std::string JITSessionCUDA::compile_module_to_ptx(
     options.NoInfsFPMath = 0;
     options.NoNaNsFPMath = 0;
   }
+
   options.HonorSignDependentRoundingFPMathOption = 0;
   options.NoZerosInBSS = 0;
   options.GuaranteedTailCallOpt = 0;
 
+#if LLVM_VERSION_MAJOR >= 18
+  const auto opt_level = llvm::CodeGenOptLevel::Aggressive;
+#else
+  const auto opt_level = llvm::CodeGenOpt::Aggressive;
+#endif
   std::unique_ptr<TargetMachine> target_machine(target->createTargetMachine(
       triple.str(), CUDAContext::get_instance().get_mcpu(), cuda_mattrs(),
-      options, llvm::Reloc::PIC_, llvm::CodeModel::Small,
-      CodeGenOpt::Aggressive));
+      options, llvm::Reloc::PIC_, llvm::CodeModel::Small, opt_level));
 
   TI_ERROR_UNLESS(target_machine.get(), "Could not allocate target machine!");
 
+  module->setTargetTriple(triple.str());
   module->setDataLayout(target_machine->createDataLayout());
 
-  // Set up passes
-  llvm::SmallString<8> outstr;
-  raw_svector_ostream ostream(outstr);
-  ostream.SetUnbuffered();
-
-  legacy::FunctionPassManager function_pass_manager(module.get());
-  legacy::PassManager module_pass_manager;
-
-  module_pass_manager.add(createTargetTransformInfoWrapperPass(
-      target_machine->getTargetIRAnalysis()));
-  function_pass_manager.add(createTargetTransformInfoWrapperPass(
-      target_machine->getTargetIRAnalysis()));
-
   // NVidia's libdevice library uses a __nvvm_reflect to choose
   // how to handle denormalized numbers. (The pass replaces calls
   // to __nvvm_reflect with a constant via a map lookup. The inliner
@@ -178,51 +176,41 @@ std::string JITSessionCUDA::compile_module_to_ptx(
     }
   }
 
-  PassManagerBuilder b;
-  b.OptLevel = 3;
-  b.Inliner = createFunctionInliningPass(b.OptLevel, 0, false);
-  b.LoopVectorize = false;
-  b.SLPVectorize = false;
+  // Create the new analysis manager
+  llvm::LoopAnalysisManager LAM;
+  llvm::FunctionAnalysisManager FAM;
+  llvm::CGSCCAnalysisManager CGAM;
+  llvm::ModuleAnalysisManager MAM;
 
-  target_machine->adjustPassManager(b);
+  // Create the new pass builder
+  llvm::PipelineTuningOptions PTO;
+  PTO.LoopInterleaving = false;
+  PTO.LoopVectorization = false;
+  PTO.SLPVectorization = true;
+  PTO.LoopUnrolling = false;
+  PTO.ForgetAllSCEVInLoopUnroll = true;
 
-  b.populateFunctionPassManager(function_pass_manager);
-  b.populateModulePassManager(module_pass_manager);
+  llvm::PassBuilder PB(target_machine.get(), PTO);
 
-  // Override default to generate verbose assembly.
-  target_machine->Options.MCOptions.AsmVerbose = true;
-
-  /*
-    Optimization for llvm::GetElementPointer:
-    https://github.com/taichi-dev/taichi/issues/5472 The three other passes
-    "loop-reduce", "ind-vars", "cse" serves as preprocessing for
-    "separate-const-offset-gep".
+  PB.registerModuleAnalyses(MAM);
+  PB.registerCGSCCAnalyses(CGAM);
+  PB.registerFunctionAnalyses(FAM);
+  PB.registerLoopAnalyses(LAM);
+  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
 
-    Note there's an update for "separate-const-offset-gep" in llvm-12.
-  */
-  module_pass_manager.add(llvm::createLoopStrengthReducePass());
-  module_pass_manager.add(llvm::createIndVarSimplifyPass());
-  module_pass_manager.add(llvm::createSeparateConstOffsetFromGEPPass(false));
-  module_pass_manager.add(llvm::createEarlyCSEPass(true));
+  target_machine->registerPassBuilderCallbacks(PB);
 
-  // Ask the target to add backend passes as necessary.
-  bool fail = target_machine->addPassesToEmitFile(
-      module_pass_manager, ostream, nullptr, llvm::CGFT_AssemblyFile, true);
-
-  TI_ERROR_IF(fail, "Failed to set up passes to emit PTX source\n");
+  llvm::ModulePassManager MPM =
+      PB.buildPerModuleDefaultPipeline(llvm::OptimizationLevel::O3);
 
   {
-    TI_PROFILER("llvm_function_pass");
-    function_pass_manager.doInitialization();
-    for (llvm::Module::iterator i = module->begin(); i != module->end(); i++)
-      function_pass_manager.run(*i);
-
-    function_pass_manager.doFinalization();
+    TI_PROFILER("llvm_module_pass");
+    MPM.run(*module, MAM);
   }
 
-  {
-    TI_PROFILER("llvm_module_pass");
-    module_pass_manager.run(*module);
+  if (llvm::verifyModule(*module, &llvm::errs())) {
+    module->print(llvm::errs(), nullptr);
+    TI_ERROR("LLVM Module broken");
   }
 
   if (this->config_.print_kernel_llvm_ir_optimized) {
@@ -232,9 +220,29 @@ std::string JITSessionCUDA::compile_module_to_ptx(
     writer.write(module.get());
   }
 
-  std::string buffer(outstr.begin(), outstr.end());
+  llvm::SmallString<8> outstr;
+  raw_svector_ostream ostream(outstr);
+  ostream.SetUnbuffered();
 
-  // Null-terminate the ptx source
+  llvm::legacy::PassManager LPM;
+  LPM.add(createTargetTransformInfoWrapperPass(
+      target_machine->getTargetIRAnalysis()));
+
+  // Override default to generate verbose assembly.
+  target_machine->Options.MCOptions.AsmVerbose = true;
+
+#if LLVM_VERSION_MAJOR >= 18
+  const auto file_type = llvm::CodeGenFileType::AssemblyFile;
+#else
+  const auto file_type = llvm::CGFT_AssemblyFile;
+#endif
+  bool fail = target_machine->addPassesToEmitFile(LPM, ostream, nullptr,
+                                                  file_type, true);
+
+  TI_ERROR_IF(fail, "Failed to set up passes to emit PTX source\n");
+  LPM.run(*module);
+
+  std::string buffer(outstr.begin(), outstr.end());
   buffer.push_back(0);
   return buffer;
 }
diff --git a/taichi/runtime/cuda/jit_cuda.h b/taichi/runtime/cuda/jit_cuda.h
index fb611fcac8f1d..4919cb67ad84f 100644
--- a/taichi/runtime/cuda/jit_cuda.h
+++ b/taichi/runtime/cuda/jit_cuda.h
@@ -1,3 +1,5 @@
+#pragma once // It's good practice to have include guards
+
 #include <memory>
 
 #include "llvm/ADT/StringRef.h"
@@ -7,18 +9,26 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/LegacyPassManager.h"            // Obsolete: Removed
 #include "llvm/IR/Verifier.h"
-#include "llvm/Transforms/InstCombine/InstCombine.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/GVN.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+// #include "llvm/Transforms/InstCombine/InstCombine.h"// Obsolete: Handled by NPM PassBuilder
+// #include "llvm/Transforms/Scalar.h"                 // Obsolete: Handled by NPM PassBuilder
+// #include "llvm/Transforms/Scalar/GVN.h"             // Obsolete: Handled by NPM PassBuilder
+// #include "llvm/Transforms/IPO.h"                    // Obsolete: Handled by NPM PassBuilder
+// #include "llvm/Transforms/IPO/PassManagerBuilder.h" // Obsolete: Removed
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/MC/TargetRegistry.h"
-#include "llvm/Target/TargetMachine.h"
+// Note: TargetMachine is already included above
+// #include "llvm/Target/TargetMachine.h"
 #include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
 
+// === CHANGED SECTION: HEADER INCLUDES ===
+// Add the main header for the New Pass Manager. This will be needed
+// by the implementation file (jit_cuda.cpp) to build the pass pipeline.
+#include "llvm/Passes/PassBuilder.h"
+// === END OF CHANGED SECTION ===
+
+
 #include "taichi/rhi/cuda/cuda_context.h"
 #include "taichi/rhi/cuda/cuda_driver.h"
 #include "taichi/jit/jit_session.h"
diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp
index a539d5697781f..5e206f7c50e2b 100644
--- a/taichi/runtime/llvm/llvm_context.cpp
+++ b/taichi/runtime/llvm/llvm_context.cpp
@@ -321,16 +321,6 @@ static void remove_useless_cuda_libdevice_functions(llvm::Module *module) {
       "lgammaf",
       "tgamma",
       "lgamma",
-      "erff",
-      "erfinvf",
-      "erfcf",
-      "erfcxf",
-      "erfcinvf",
-      "erf",
-      "erfinv",
-      "erfcx",
-      "erfcinv",
-      "erfc",
   };
   for (auto fn : function_name_list) {
     module->getFunction("__nv_" + fn)->eraseFromParent();
@@ -339,7 +329,6 @@ static void remove_useless_cuda_libdevice_functions(llvm::Module *module) {
 }
 
 // Note: runtime_module = init_module < struct_module
-
 std::unique_ptr<llvm::Module> TaichiLLVMContext::clone_runtime_module() {
   TI_AUTO_PROF
   auto *mod = get_this_thread_runtime_module();
@@ -1166,4 +1155,4 @@ TaichiLLVMContext::get_struct_type_with_data_layout(const StructType *old_ty,
 
 TI_REGISTER_TASK(make_slim_libdevice);
 
-}  // namespace taichi::lang
+}  // namespace taichi::lang
\ No newline at end of file
diff --git a/taichi/runtime/llvm/llvm_context_pass.h b/taichi/runtime/llvm/llvm_context_pass.h
index e8c0a980eb588..0fcc06ab3f673 100644
--- a/taichi/runtime/llvm/llvm_context_pass.h
+++ b/taichi/runtime/llvm/llvm_context_pass.h
@@ -6,7 +6,6 @@
 #include "llvm/Pass.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/PassManagerBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/Support/SourceMgr.h"
@@ -249,13 +248,13 @@ struct AMDGPUConvertFuncParamAddressSpacePass : public ModulePass {
       std::vector<llvm::Type *> new_func_params;
       for (auto &arg : f->args()) {
         if (arg.getType()->getTypeID() == llvm::Type::PointerTyID) {
-          // This is a temporary LLVM interface to handle transition from typed
-          // pointer to opaque pointer In the future, if we only clang++ > 14,
-          // we can compeletely comply to opaque pointer and replace the
-          // following code with llvm::PointerType::get(M.getContext(),
-          // usigned(1))
+#if LLVM_VERSION_MAJOR >= 16
+          auto new_type = llvm::PointerType::get(M.getContext(), unsigned(1));
+#else
           auto new_type = llvm::PointerType::getWithSamePointeeType(
               llvm::dyn_cast<llvm::PointerType>(arg.getType()), unsigned(1));
+#endif
+
 
           new_func_params.push_back(new_type);
         } else {
@@ -274,17 +273,29 @@ struct AMDGPUConvertFuncParamAddressSpacePass : public ModulePass {
       new_func->setComdat(f->getComdat());
       f->getParent()->getFunctionList().insert(f->getIterator(), new_func);
       new_func->takeName(f);
+#if LLVM_VERSION_MAJOR >= 16
+      new_func->splice(new_func->begin(), f);
+#else
       new_func->getBasicBlockList().splice(new_func->begin(),
                                            f->getBasicBlockList());
+#endif
       for (llvm::Function::arg_iterator I = f->arg_begin(), E = f->arg_end(),
                                         I2 = new_func->arg_begin();
            I != E; ++I, ++I2) {
         if (I->getType()->getTypeID() == llvm::Type::PointerTyID) {
+#if LLVM_VERSION_MAJOR >= 16
+	  auto &front_bb = new_func->getEntryBlock();
+          llvm::Instruction *addrspacecast =
+              new AddrSpaceCastInst(I2, I->getType());
+	  addrspacecast->insertAfter(front_bb.getFirstInsertionPt());
+          //front_bb.getFirstInsertionPt()->insertAfter(addrspacecast);
+#else
           auto &front_bb = new_func->getBasicBlockList().front();
           llvm::Instruction *addrspacecast =
               new AddrSpaceCastInst(I2, I->getType());
           front_bb.getInstList().insertAfter(front_bb.getFirstInsertionPt(),
                                              addrspacecast);
+#endif
           I->replaceAllUsesWith(addrspacecast);
           I2->takeName(&*I);
         } else {
diff --git a/taichi/ui/common/window_base.cpp b/taichi/ui/common/window_base.cpp
index 10af909476ee6..16bb9a3fc25e8 100644
--- a/taichi/ui/common/window_base.cpp
+++ b/taichi/ui/common/window_base.cpp
@@ -1,5 +1,6 @@
 #include "taichi/ui/common/window_base.h"
 #include "taichi/rhi/common/window_system.h"
+#include "taichi/ui/utils/utils.h"
 
 namespace taichi::ui {
 
diff --git a/taichi/ui/ggui/gui.cpp b/taichi/ui/ggui/gui.cpp
index f5bc10833880b..13108ceb65f02 100644
--- a/taichi/ui/ggui/gui.cpp
+++ b/taichi/ui/ggui/gui.cpp
@@ -1,3 +1,4 @@
+
 #include "gui.h"
 #include "taichi/ui/ggui/swap_chain.h"
 #include "taichi/ui/ggui/app_context.h"
@@ -44,7 +45,9 @@ Gui::Gui(AppContext *app_context, SwapChain *swap_chain, TaichiWindow *window) {
 
 void Gui::init_render_resources(VkRenderPass render_pass) {
   ImGui_ImplVulkan_LoadFunctions(
-      load_vk_function_for_gui);  // this is because we're using volk.
+      VK_API_VERSION_1_0,        // or app_context_->config.vk_api_version
+      load_vk_function_for_gui,  // this is because we're using volk.
+      nullptr);
 
   auto &device =
       static_cast<taichi::lang::vulkan::VulkanDevice &>(app_context_->device());
@@ -60,7 +63,9 @@ void Gui::init_render_resources(VkRenderPass render_pass) {
   init_info.Allocator = VK_NULL_HANDLE;
   init_info.MinImageCount = swap_chain_->surface().get_image_count();
   init_info.ImageCount = swap_chain_->surface().get_image_count();
-  ImGui_ImplVulkan_Init(&init_info, render_pass);
+  // new signature takes only the struct
+  init_info.RenderPass = render_pass;
+  ImGui_ImplVulkan_Init(&init_info);
   render_pass_ = render_pass;
 
   // Upload Fonts
@@ -73,10 +78,11 @@ void Gui::init_render_resources(VkRenderPass render_pass) {
             ->vk_command_buffer()
             ->buffer;
 
-    ImGui_ImplVulkan_CreateFontsTexture(command_buffer);
+    // ≥ 1.90: the helper records its own commands
+    ImGui_ImplVulkan_CreateFontsTexture();
 
     stream->submit_synced(cmd_list.get());
-    ImGui_ImplVulkan_DestroyFontUploadObjects();
+    ImGui_ImplVulkan_DestroyFontsTexture();
   }
 
   prepare_for_next_frame();
diff --git a/tests/cpp/llvm/llvm_offline_cache_test.cpp b/tests/cpp/llvm/llvm_offline_cache_test.cpp
index 0cdc387f9488b..b891976af3faa 100644
--- a/tests/cpp/llvm/llvm_offline_cache_test.cpp
+++ b/tests/cpp/llvm/llvm_offline_cache_test.cpp
@@ -9,7 +9,7 @@
 
 namespace fs = std::filesystem;
 
-#include "llvm/ADT/Triple.h"
+#include "llvm/TargetParser/Triple.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"