Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
2 changes: 1 addition & 1 deletion 3rd/cutlass/include/cute/algorithm/axpby.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
Expand Down
2 changes: 1 addition & 1 deletion 3rd/cutlass/include/cute/algorithm/clear.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
Expand Down
2 changes: 1 addition & 1 deletion 3rd/cutlass/include/cute/algorithm/cooperative_copy.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2017 - 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
Expand Down
2 changes: 1 addition & 1 deletion 3rd/cutlass/include/cute/algorithm/cooperative_gemm.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2017 - 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
Expand Down
15 changes: 6 additions & 9 deletions 3rd/cutlass/include/cute/algorithm/copy.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -254,19 +254,16 @@ copy(AutoVectorizingCopyWithAssumedAlignment<MaxVecBits> const&,
if constexpr (common_elem > 1)
{
constexpr int align_bits = CUTE_STATIC_V(gcd(max_alignment(src), max_alignment(dst), Int<MaxVecBits>{}));
constexpr int vec_bits = gcd(common_elem * sizeof_bits_v<typename SrcEngine::value_type>, align_bits);
constexpr int vec_bits = gcd(common_elem * sizeof_bits_v<typename DstEngine::value_type>, align_bits);

if constexpr ((vec_bits % 8) == 0)
if constexpr ((vec_bits % 8) == 0 && sizeof_bits_v<typename DstEngine::value_type> < Int<vec_bits>{})
{
// If more than one element vectorizes to 8bits or more, then recast and copy
// If more than one element vectorizes to a multiple of 8bits that is larger than the value_type, then recast and copy
using VecType = uint_bit_t<vec_bits>;
// Preserve volatility
using SrcVecType = conditional_t<is_volatile_v<typename SrcEngine::element_type>, VecType const volatile, VecType const>;
using DstVecType = conditional_t<is_volatile_v<typename DstEngine::element_type>, VecType volatile, VecType >;

// Recast
Tensor src_v = recast<SrcVecType>(src);
Tensor dst_v = recast<DstVecType>(dst);
Tensor src_v = recast<VecType>(src);
Tensor dst_v = recast<VecType>(dst);
return copy_if(constant_fn<true_type>{}, src_v, dst_v);
} else {
return copy_if(constant_fn<true_type>{}, src, dst);
Expand Down
2 changes: 1 addition & 1 deletion 3rd/cutlass/include/cute/algorithm/fill.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
Expand Down
2 changes: 1 addition & 1 deletion 3rd/cutlass/include/cute/algorithm/functional.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
Expand Down
2 changes: 1 addition & 1 deletion 3rd/cutlass/include/cute/algorithm/gemm.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
Expand Down
2 changes: 1 addition & 1 deletion 3rd/cutlass/include/cute/algorithm/prefer.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
Expand Down
2 changes: 1 addition & 1 deletion 3rd/cutlass/include/cute/algorithm/prefetch.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2024 - 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
Expand Down
2 changes: 1 addition & 1 deletion 3rd/cutlass/include/cute/algorithm/tensor_algorithms.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
Expand Down
2 changes: 1 addition & 1 deletion 3rd/cutlass/include/cute/algorithm/tensor_reduce.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2025 - 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
Expand Down
2 changes: 1 addition & 1 deletion 3rd/cutlass/include/cute/algorithm/tuple_algorithms.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
Expand Down
2 changes: 1 addition & 1 deletion 3rd/cutlass/include/cute/arch/cluster_sm100.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
Expand Down
2 changes: 1 addition & 1 deletion 3rd/cutlass/include/cute/arch/cluster_sm90.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
Expand Down
71 changes: 56 additions & 15 deletions 3rd/cutlass/include/cute/arch/config.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2024 - 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -54,25 +54,28 @@
////////////////////////////////////////////////////////////////////////////////////////////////////

#if (defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM101A_ENABLED) ||\
defined(CUTLASS_ARCH_MMA_SM120A_ENABLED))
defined(CUTLASS_ARCH_MMA_SM103A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM120A_ENABLED) ||\
defined(CUTLASS_ARCH_MMA_SM121A_ENABLED))
# define CUTE_ARCH_TMA_SM90_ENABLED
# define CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED
# define CUTE_ARCH_STSM_SM90_ENABLED
#endif

#if (defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM101A_ENABLED))
#if (defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM101A_ENABLED) ||\
defined(CUTLASS_ARCH_MMA_SM103A_ENABLED))
# define CUTE_ARCH_TCGEN05_TF32_MMA_ENABLED
# define CUTE_ARCH_TCGEN05_F16F32_MMA_ENABLED
# define CUTE_ARCH_TCGEN05_MXF8F6F4_MMA_ENABLED
# define CUTE_ARCH_TCGEN05_MXF4_MMA_ENABLED
# define CUTE_ARCH_TCGEN05_MXF4NVF4_MMA_ENABLED
#endif

#if defined(CUTLASS_ARCH_MMA_SM100A_ENABLED)
#if defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM103A_ENABLED)
# define CUTE_ARCH_TCGEN05_F16BF16_MMA_SCALED_ENABLED
#endif

#if (defined(CUTLASS_ARCH_MMA_SM100F_ENABLED) || defined(CUTLASS_ARCH_MMA_SM101F_ENABLED))
#if (defined(CUTLASS_ARCH_MMA_SM100F_ENABLED) || defined(CUTLASS_ARCH_MMA_SM101F_ENABLED) ||\
defined(CUTLASS_ARCH_MMA_SM103F_ENABLED))
# define CUTE_ARCH_TMA_SM90_ENABLED
# define CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED
# define CUTE_ARCH_STSM_SM90_ENABLED
Expand All @@ -83,32 +86,59 @@
# define CUTE_ARCH_TCGEN05_MXF4NVF4_MMA_ENABLED
#endif

#if defined(CUTLASS_ARCH_MMA_SM100F_ENABLED)
#if defined(CUTLASS_ARCH_MMA_SM100F_ENABLED) || defined(CUTLASS_ARCH_MMA_SM103F_ENABLED)
# define CUTE_ARCH_TCGEN05_F16BF16_MMA_SCALED_ENABLED
#endif

#if (defined(CUTLASS_ARCH_MMA_SM120F_ENABLED))
#if (defined(CUTLASS_ARCH_MMA_SM120F_ENABLED) || defined(CUTLASS_ARCH_MMA_SM121F_ENABLED))
# define CUTE_ARCH_TMA_SM90_ENABLED
# define CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED
# define CUTE_ARCH_STSM_SM90_ENABLED
#endif


// SM110 specific configs
#if (defined(CUTLASS_ARCH_MMA_SM110A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM110F_ENABLED))
# define CUTE_ARCH_TMA_SM90_ENABLED
# define CUTE_ARCH_DEVICE_MODIFIABLE_TMA_SM90_ENABLED
# define CUTE_ARCH_STSM_SM90_ENABLED
# define CUTE_ARCH_TCGEN05_TF32_MMA_ENABLED
# define CUTE_ARCH_TCGEN05_F16F32_MMA_ENABLED
# define CUTE_ARCH_TCGEN05_MXF8F6F4_MMA_ENABLED
# define CUTE_ARCH_TCGEN05_MXF4_MMA_ENABLED
# define CUTE_ARCH_TCGEN05_MXF4NVF4_MMA_ENABLED
# define CUTE_ARCH_TCGEN05_S8_MMA_ENABLED
# define CUTE_ARCH_LDSM_SM100A_ENABLED
# define CUTE_ARCH_STSM_SM100A_ENABLED
# define CUTE_ARCH_TCGEN05_TMEM_ENABLED
# define CUTE_ARCH_TMA_SM100_ENABLED
# define CUTE_ARCH_LOAD256_SM100A_ENABLED
# define CUTE_ARCH_STORE256_SM100A_ENABLED
# define CUTE_ARCH_FLOAT2_MATH_ENABLED
#endif

#if (defined(CUTLASS_ARCH_MMA_SM110A_ENABLED))
# define CUTE_ARCH_TCGEN05_S8_MMA_ENABLED
#endif

#if (defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM101A_ENABLED))
# define CUTE_ARCH_TCGEN05_S8_MMA_ENABLED
#endif

#if (defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM101A_ENABLED) ||\
defined(CUTLASS_ARCH_MMA_SM120A_ENABLED))
defined(CUTLASS_ARCH_MMA_SM103A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM120A_ENABLED) ||\
defined(CUTLASS_ARCH_MMA_SM120A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM121A_ENABLED))
# define CUTE_ARCH_LDSM_SM100A_ENABLED
# define CUTE_ARCH_STSM_SM100A_ENABLED
#endif

#if (defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM101A_ENABLED))
#if (defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM101A_ENABLED) ||\
defined(CUTLASS_ARCH_MMA_SM103A_ENABLED))
# define CUTE_ARCH_TCGEN05_TMEM_ENABLED
#endif

#if (defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM101A_ENABLED))
#if (defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM101A_ENABLED) ||\
defined(CUTLASS_ARCH_MMA_SM103A_ENABLED))
# define CUTE_ARCH_TMA_SM100_ENABLED
#endif

Expand All @@ -120,21 +150,26 @@
# define CUTE_ARCH_FLOAT2_MATH_ENABLED
#endif

#if defined(CUTLASS_ARCH_MMA_SM120_ENABLED) || defined(CUTLASS_ARCH_MMA_SM120A_ENABLED)
#if (defined(CUTLASS_ARCH_MMA_SM120_ENABLED) || defined(CUTLASS_ARCH_MMA_SM120A_ENABLED) ||\
defined(CUTLASS_ARCH_MMA_SM121_ENABLED) || defined(CUTLASS_ARCH_MMA_SM121A_ENABLED))
# define CUTE_ARCH_MMA_SM120_ENABLED
# define CUTE_ARCH_TMA_SM120_ENABLED
#endif

#if defined(CUTLASS_ARCH_MMA_SM120_ENABLED) || defined(CUTLASS_ARCH_MMA_SM120A_ENABLED)
#if (defined(CUTLASS_ARCH_MMA_SM120_ENABLED) || defined(CUTLASS_ARCH_MMA_SM120A_ENABLED) ||\
defined(CUTLASS_ARCH_MMA_SM121_ENABLED) || defined(CUTLASS_ARCH_MMA_SM121A_ENABLED))
# if (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 8))
# define CUTE_ARCH_F8F6F4_MMA_ENABLED
# define CUTE_ARCH_MXF8F6F4_MMA_ENABLED
# define CUTE_ARCH_MXF4NVF4_2X_UE8M0_MMA_ENABLED
# define CUTE_ARCH_MXF4NVF4_4X_UE4M3_MMA_ENABLED
# endif
# if (__CUDACC_VER_MAJOR__ == 13 && __CUDACC_VER_MINOR__ >= 1)
# define CUTE_ARCH_MXF4NVF4_4X_UE8M0_MMA_ENABLED
# endif
#endif

#if defined(CUTLASS_ARCH_MMA_SM100F_ENABLED)
#if defined(CUTLASS_ARCH_MMA_SM100F_ENABLED) || defined(CUTLASS_ARCH_MMA_SM103F_ENABLED)
# define CUTE_ARCH_LDSM_SM100A_ENABLED
# define CUTE_ARCH_STSM_SM100A_ENABLED
# define CUTE_ARCH_TCGEN05_TMEM_ENABLED
Expand All @@ -149,14 +184,16 @@
# define CUTE_ARCH_TMA_SM100_ENABLED
#endif

#if defined(CUTLASS_ARCH_MMA_SM120F_ENABLED)
#if (defined(CUTLASS_ARCH_MMA_SM120F_ENABLED) || defined(CUTLASS_ARCH_MMA_SM121F_ENABLED))
# define CUTE_ARCH_LDSM_SM100A_ENABLED
# define CUTE_ARCH_STSM_SM100A_ENABLED
#endif

#if (defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM100F_ENABLED) ||\
defined(CUTLASS_ARCH_MMA_SM101A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM101F_ENABLED) ||\
defined(CUTLASS_ARCH_MMA_SM120A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM120F_ENABLED))
defined(CUTLASS_ARCH_MMA_SM103A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM103F_ENABLED) ||\
defined(CUTLASS_ARCH_MMA_SM120A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM120F_ENABLED) ||\
defined(CUTLASS_ARCH_MMA_SM121A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM121F_ENABLED))
# if (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9))
# define CUTE_ARCH_LOAD256_SM100A_ENABLED
# define CUTE_ARCH_STORE256_SM100A_ENABLED
Expand All @@ -168,3 +205,7 @@
#define CUTE_ARCH_FLOAT2_MATH_ENABLED
#endif

#if defined(CUTLASS_ARCH_MMA_SM103_ENABLED) || defined(CUTLASS_ARCH_MMA_SM100F_ENABLED)
# define CUTE_ARCH_TCGEN05_MXF4NVF4_MMA_ULTRA_ENABLED
#endif

2 changes: 1 addition & 1 deletion 3rd/cutlass/include/cute/arch/copy.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
Expand Down
47 changes: 46 additions & 1 deletion 3rd/cutlass/include/cute/arch/copy_sm100.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2023 - 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -41,6 +41,51 @@ namespace cute {

////////////////////////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////////////////////////
//
// Global Memory Load and Store PTX definitions
//
////////////////////////////////////////////////////////////////////////////////////////////////////

// Copy operation that wraps one 256-bit global-memory load, issued as the PTX instruction
// `ld.global.L1::no_allocate.v8.f32`, and hands the result back as eight 32-bit values.
struct SM100_LOAD_256bit_CACHE_NOALLOCATION
{
// Source side of the copy: a single 256-bit value (matches the `gmem_addr` parameter of copy()).
using SRegisters = uint256_t[1];
// Destination side of the copy: eight 32-bit values (matches dst0..dst7 of copy()).
using DRegisters = uint32_t[8];

// Loads 256 bits from the address of `gmem_addr` into dst0..dst7.
//   gmem_addr  - object whose address (&gmem_addr) is passed to the instruction as its 64-bit
//                "l" operand; the `.global` qualifier means it must be a global-memory address.
//   dst0..dst7 - outputs, bound in this order to the eight elements of the instruction's
//                vector operand.
// When CUTE_ARCH_LOAD256_SM100A_ENABLED is not defined, no load is emitted and the body is
// CUTE_INVALID_CONTROL_PATH with a diagnostic message instead.
CUTE_HOST_DEVICE static void
copy(uint256_t const& gmem_addr,
uint32_t& dst0, uint32_t& dst1, uint32_t& dst2, uint32_t& dst3,
uint32_t& dst4, uint32_t& dst5, uint32_t& dst6, uint32_t& dst7)
{
#if defined(CUTE_ARCH_LOAD256_SM100A_ENABLED)
// %0..%7 are the eight outputs, %8 is the source address.
// NOTE(review): the instruction is typed .f32 while the operands are bound through "r"
// constraints as uint32_t; presumably the data is moved bit-for-bit with no conversion —
// confirm against the PTX ISA.
asm volatile("ld.global.L1::no_allocate.v8.f32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];\n"
: "=r"(dst0), "=r"(dst1), "=r"(dst2), "=r"(dst3), "=r"(dst4), "=r"(dst5), "=r"(dst6), "=r"(dst7)
: "l"(&gmem_addr) );
#else
CUTE_INVALID_CONTROL_PATH("Trying to use LOAD.256 without CUTE_ARCH_LOAD256_SM100A_ENABLED.");
#endif
}
};

// Copy operation that wraps one 256-bit global-memory store, issued as the PTX instruction
// `st.global.L1::no_allocate.v8.f32`, taking the data as eight 32-bit values.
struct SM100_STORE_256bit_CACHE_NOALLOCATION
{
// Source side of the copy: eight 32-bit values (matches src0..src7 of copy()).
using SRegisters = uint32_t[8];
// Destination side of the copy: a single 256-bit value (matches the `gmem_addr` parameter of copy()).
using DRegisters = uint256_t[1];

// Stores src0..src7 (256 bits in total) to the address of `gmem_addr`.
//   src0..src7 - inputs, bound in this order to the eight elements of the instruction's
//                vector operand.
//   gmem_addr  - object whose address (&gmem_addr) is passed to the instruction as its 64-bit
//                "l" operand; the `.global` qualifier means it must be a global-memory address.
// When CUTE_ARCH_STORE256_SM100A_ENABLED is not defined, no store is emitted and the body is
// CUTE_INVALID_CONTROL_PATH with a diagnostic message instead.
CUTE_HOST_DEVICE static void
copy(uint32_t const& src0, uint32_t const& src1, uint32_t const& src2, uint32_t const& src3,
uint32_t const& src4, uint32_t const& src5, uint32_t const& src6, uint32_t const& src7,
uint256_t& gmem_addr)
{
#if defined(CUTE_ARCH_STORE256_SM100A_ENABLED)
// %0 is the destination address, %1..%8 are the eight inputs. There are no output operands:
// the write goes through the pointer, and `asm volatile` marks the statement as having
// side effects.
// NOTE(review): the instruction is typed .f32 while the operands are bound through "r"
// constraints as uint32_t; presumably the data is moved bit-for-bit with no conversion —
// confirm against the PTX ISA.
asm volatile("st.global.L1::no_allocate.v8.f32 [%0], {%1, %2, %3, %4, %5, %6, %7, %8};\n"
:: "l"(&gmem_addr), "r"(src0), "r"(src1), "r"(src2), "r"(src3), "r"(src4), "r"(src5), "r"(src6), "r"(src7));
#else
CUTE_INVALID_CONTROL_PATH("Trying to use stg.256 without CUTE_ARCH_STORE256_SM100A_ENABLED.");
#endif
}
};

////////////////////////////////////////////////////////////////////////////////////////////////////
//
// LDSM PTX definitions
Expand Down
4 changes: 3 additions & 1 deletion 3rd/cutlass/include/cute/arch/copy_sm100_tma.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2020 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2020 - 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -37,6 +37,8 @@

#include <cute/arch/copy.hpp>
#include <cute/arch/copy_sm90.hpp>
#include "cutlass/arch/synclog.hpp"

namespace cute
{

Expand Down
2 changes: 1 addition & 1 deletion 3rd/cutlass/include/cute/arch/copy_sm50.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/***************************************************************************************************
* Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2024 - 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*
* Redistribution and use in source and binary forms, with or without
Expand Down
Loading