Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 0 additions & 78 deletions docs/Doxyfile
Original file line number Diff line number Diff line change
Expand Up @@ -93,14 +93,6 @@ ALLOW_UNICODE_NAMES = NO

OUTPUT_LANGUAGE = English

# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all
# documentation generated by doxygen is written. Doxygen uses this information
# to produce all of its output in the proper text direction.
# Possible values are: None, LTR, RTL and Context.
# The default value is: None.

OUTPUT_TEXT_DIRECTION = None

# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
# descriptions after the members that are listed in the file and class
# documentation (similar to Javadoc). Set to NO to disable this.
Expand Down Expand Up @@ -263,12 +255,6 @@ TAB_SIZE = 2

ALIASES =

# This tag can be used to specify a number of word-keyword mappings (TCL only).
# A mapping has the form "name=value". For example adding "class=itcl::class"
# will allow you to use the command class in the itcl::class meaning.

TCL_SUBST =

# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
# only. Doxygen will then generate output that is more tailored for C. For
# instance, some of the names that are used will be different. The list of all
Expand Down Expand Up @@ -1156,13 +1142,6 @@ CLANG_DATABASE_PATH =

ALPHABETICAL_INDEX = YES

# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
# which the alphabetical index list will be split.
# Minimum value: 1, maximum value: 20, default value: 5.
# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.

COLS_IN_ALPHA_INDEX = 5

# In case all classes in a project start with a common prefix, all classes will
# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
# can be used to specify a prefix (or a list of prefixes) that should be ignored
Expand Down Expand Up @@ -1290,15 +1269,6 @@ HTML_COLORSTYLE_SAT = 100

HTML_COLORSTYLE_GAMMA = 80

# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
# page will contain the date and time when the page was generated. Setting this
# to YES can help to show when doxygen was last run and thus if the
# documentation is up to date.
# The default value is: NO.
# This tag requires that the tag GENERATE_HTML is set to YES.

HTML_TIMESTAMP = NO

# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
# documentation will contain a main index with vertical navigation menus that
# are dynamically created via JavaScript. If disabled, the navigation index will
Expand Down Expand Up @@ -1580,17 +1550,6 @@ EXT_LINKS_IN_WINDOW = NO

FORMULA_FONTSIZE = 10

# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
# generated for formulas are transparent PNGs. Transparent PNGs are not
# supported properly for IE 6.0, but are supported on all modern browsers.
#
# Note that when changing this option you need to delete any form_*.png files in
# the HTML output directory before the change takes effect.
# The default value is: YES.
# This tag requires that the tag GENERATE_HTML is set to YES.

FORMULA_TRANSPARENT = YES

# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands
# to create new LaTeX commands to be used in formulas as building blocks. See
# the section "Including formulas" for details.
Expand Down Expand Up @@ -1889,16 +1848,6 @@ LATEX_BATCHMODE = NO

LATEX_HIDE_INDICES = NO

# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
# code with syntax highlighting in the LaTeX output.
#
# Note that which sources are shown also depends on other settings such as
# SOURCE_BROWSER.
# The default value is: NO.
# This tag requires that the tag GENERATE_LATEX is set to YES.

LATEX_SOURCE_CODE = NO

# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
# bibliography, e.g. plainnat, or ieeetr. See
# https://en.wikipedia.org/wiki/BibTeX and \cite for more info.
Expand All @@ -1907,14 +1856,6 @@ LATEX_SOURCE_CODE = NO

LATEX_BIB_STYLE = plain

# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated
# page will contain the date and time when the page was generated. Setting this
# to NO can help when comparing the output of multiple runs.
# The default value is: NO.
# This tag requires that the tag GENERATE_LATEX is set to YES.

LATEX_TIMESTAMP = NO

# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute)
# path from which the emoji images will be read. If a relative path is entered,
# it will be relative to the LATEX_OUTPUT directory. If left blank the
Expand Down Expand Up @@ -1979,16 +1920,6 @@ RTF_STYLESHEET_FILE =

RTF_EXTENSIONS_FILE =

# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
# with syntax highlighting in the RTF output.
#
# Note that which sources are shown also depends on other settings such as
# SOURCE_BROWSER.
# The default value is: NO.
# This tag requires that the tag GENERATE_RTF is set to YES.

RTF_SOURCE_CODE = NO

#---------------------------------------------------------------------------
# Configuration options related to the man page output
#---------------------------------------------------------------------------
Expand Down Expand Up @@ -2085,15 +2016,6 @@ GENERATE_DOCBOOK = NO

DOCBOOK_OUTPUT = docbook

# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the
# program listings (including syntax highlighting and cross-referencing
# information) in the DOCBOOK output. Note that enabling this will significantly
# increase the size of the DOCBOOK output.
# The default value is: NO.
# This tag requires that the tag GENERATE_DOCBOOK is set to YES.

DOCBOOK_PROGRAMLISTING = NO

#---------------------------------------------------------------------------
# Configuration options for the AutoGen Definitions output
#---------------------------------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion docs/api/pytorch.rst
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ PyTorch
:members: reset, get_states, set_states, add, fork


.. autoapifunction:: transformer_engine.pytorch.autocast
.. autoapiclass:: transformer_engine.pytorch.autocast(enabled=True, calibrating=False, recipe=None, amax_reduction_group=None)

.. autoapifunction:: transformer_engine.pytorch.quantized_model_init

Expand Down
1 change: 1 addition & 0 deletions examples/jax/collective_gemm/run_test_cgemm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -143,5 +143,6 @@ wait

# Final cleanup (trap will also call cleanup on exit)
cleanup
wait

exit $HAS_FAILURE
1 change: 1 addition & 0 deletions examples/jax/encoder/run_test_multiprocessing_encoder.sh
Original file line number Diff line number Diff line change
Expand Up @@ -98,5 +98,6 @@ wait

# Final cleanup (trap will also call cleanup on exit)
cleanup
wait

exit $HAS_FAILURE
40 changes: 27 additions & 13 deletions transformer_engine/common/cast/mxfp8/quantize_mxfp8.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -643,15 +643,35 @@ void quantize(const Tensor &input, const Tensor *act_input, const Tensor *noop,
TRANSFORMER_ENGINE_SWITCH_CONDITION(
with_gemm_swizzled_scales, WITH_GEMM_SWIZZLED_SCALES,

// The specialized rowwise cast-only kernel vectorizes full 128-element chunks.
// Shapes with a partial row tail (for example, N=48) must use the generic kernel,
// otherwise the last chunk reads/writes past the logical end of the row.
using rowwise_traits = specialized::CastTraits<IType, OType, true, false>;
using bidimensional_traits = specialized::CastTraits<IType, OType, true, true>;
constexpr size_t max_grid_dim_y = 65535;
const bool rowwise_specialized_grid_fits =
((rows + rowwise_traits::blockDimM - 1) / rowwise_traits::blockDimM) <=
max_grid_dim_y;
const bool bidimensional_specialized_grid_fits =
((rows + bidimensional_traits::blockDIM::M - 1) /
bidimensional_traits::blockDIM::M) <= max_grid_dim_y;

const bool is_full_rowwise_chunk = (cols % 128 == 0);
const bool scaling_type_has_specialized_support =
(scaling_type == ScalingType::ROWWISE && is_full_rowwise_chunk &&
rowwise_specialized_grid_fits) ||
(scaling_type == ScalingType::BIDIMENSIONAL &&
bidimensional_specialized_grid_fits);

if (specialized::hasSpec<IS_DBIAS, IS_DACT, IS_ACT, IType, OType>() &&
!WITH_GEMM_SWIZZLED_SCALES) {
!WITH_GEMM_SWIZZLED_SCALES && scaling_type_has_specialized_support) {
switch (scaling_type) {
case ScalingType::ROWWISE: {
using traits = specialized::CastTraits<IType, OType, true, false>;
auto kernel = specialized::quantize_mxfp8_kernel_cast_only<traits>;

cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
traits::smem);
NVTE_CHECK_CUDA(cudaFuncSetAttribute(
kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, traits::smem));

dim3 block(traits::threadLayout::num, traits::warpLayout::N,
traits::warpLayout::M);
Expand All @@ -664,16 +684,12 @@ void quantize(const Tensor &input, const Tensor *act_input, const Tensor *noop,

break;
}
case ScalingType::COLWISE: {
NVTE_WARN("Colwise scaling will fallback to original kernel.");
break;
}
case ScalingType::BIDIMENSIONAL: {
using traits = specialized::CastTraits<IType, OType, true, true>;
auto kernel = specialized::quantize_mxfp8_kernel_cast_only<traits>;

cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
traits::smem);
NVTE_CHECK_CUDA(cudaFuncSetAttribute(
kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, traits::smem));
// TMA for loading, so that we don't need STS for transposing
alignas(64) CUtensorMap tensor_map_input{};
constexpr size_t input_type_bit_size = TypeInfo<IType>::size;
Expand Down Expand Up @@ -710,6 +726,7 @@ void quantize(const Tensor &input, const Tensor *act_input, const Tensor *noop,
NVTE_ERROR("Invalid scaling type.");
}
}
NVTE_CHECK_CUDA(cudaGetLastError());
return;
}

Expand Down Expand Up @@ -789,7 +806,6 @@ void quantize(const Tensor &input, const Tensor *act_input, const Tensor *noop,
tensor_map_output_colwise, scales_rowwise_ptr, scales_colwise_ptr, noop_ptr,
workspace_ptr, amax_ptr, rows, cols, scale_stride_rowwise,
scale_stride_colwise);
NVTE_CHECK_CUDA(cudaGetLastError());
break;
}
case ScalingType::COLWISE: {
Expand All @@ -804,7 +820,6 @@ void quantize(const Tensor &input, const Tensor *act_input, const Tensor *noop,
tensor_map_output_colwise, scales_rowwise_ptr, scales_colwise_ptr, noop_ptr,
workspace_ptr, amax_ptr, rows, cols, scale_stride_rowwise,
scale_stride_colwise);
NVTE_CHECK_CUDA(cudaGetLastError());
break;
}
case ScalingType::BIDIMENSIONAL: {
Expand All @@ -819,10 +834,9 @@ void quantize(const Tensor &input, const Tensor *act_input, const Tensor *noop,
tensor_map_output_colwise, scales_rowwise_ptr, scales_colwise_ptr, noop_ptr,
workspace_ptr, amax_ptr, rows, cols, scale_stride_rowwise,
scale_stride_colwise);
NVTE_CHECK_CUDA(cudaGetLastError());
break;
}
}
} NVTE_CHECK_CUDA(cudaGetLastError());

if constexpr (IS_DBIAS) {
common::reduce_dbias<IType>(workspace_ptr, dbias, dbias_rows, dbias_cols, stream);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,18 +91,6 @@ __device__ __forceinline__ e8m0_t to_e8m0(IType amax) {
#endif // #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
} // anonymous namespace

/*! \brief Tells whether the ENABLE_CAST_ONLY environment variable starts with '1'.
 *
 *  The environment is consulted only on the first call; the outcome is stored in a
 *  function-local static, so later changes to the variable are not observed.
 *
 *  \return true when ENABLE_CAST_ONLY was set and its first character was '1' at the
 *          time of the first call, false otherwise.
 */
inline bool is_cast_only_enabled() {
  static const bool cached_flag = [] {
    const char *value = std::getenv("ENABLE_CAST_ONLY");
    if (value == nullptr) {
      return false;
    }
    // Only the leading character is inspected, so "1", "10" and "1abc" all enable it.
    return value[0] == '1';
  }();
  return cached_flag;
}

template <bool IS_DBIAS, bool IS_DACT, bool IS_ACT, typename IType, typename OType>
inline bool hasSpec() {
return false;
Expand All @@ -112,19 +100,19 @@ inline bool hasSpec() {
// OType could be [fp8e5m2, fp8e4m3]
// Explicit specialization of hasSpec for IS_DBIAS=false, IS_DACT=false, IS_ACT=false,
// IType=fp16, OType=fp8e5m2.
// As written, the result is whatever is_cast_only_enabled() returns; the `return true;`
// that follows is never reached.
// NOTE(review): the two returns look like the before/after lines of a flattened diff --
// confirm which one is intended.
template <>
inline bool hasSpec<false, false, false, fp16, fp8e5m2>() {
return is_cast_only_enabled();
return true;
}
// Explicit specialization of hasSpec for IS_DBIAS=false, IS_DACT=false, IS_ACT=false,
// IType=fp16, OType=fp8e4m3.
// As written, the result is whatever is_cast_only_enabled() returns; the `return true;`
// that follows is never reached.
// NOTE(review): the two returns look like the before/after lines of a flattened diff --
// confirm which one is intended.
template <>
inline bool hasSpec<false, false, false, fp16, fp8e4m3>() {
return is_cast_only_enabled();
return true;
}
// Explicit specialization of hasSpec for IS_DBIAS=false, IS_DACT=false, IS_ACT=false,
// IType=bf16, OType=fp8e5m2.
// As written, the result is whatever is_cast_only_enabled() returns; the `return true;`
// that follows is never reached.
// NOTE(review): the two returns look like the before/after lines of a flattened diff --
// confirm which one is intended.
template <>
inline bool hasSpec<false, false, false, bf16, fp8e5m2>() {
return is_cast_only_enabled();
return true;
}
// Explicit specialization of hasSpec for IS_DBIAS=false, IS_DACT=false, IS_ACT=false,
// IType=bf16, OType=fp8e4m3.
// As written, the result is whatever is_cast_only_enabled() returns; the `return true;`
// that follows is never reached.
// NOTE(review): the two returns look like the before/after lines of a flattened diff --
// confirm which one is intended.
template <>
inline bool hasSpec<false, false, false, bf16, fp8e4m3>() {
return is_cast_only_enabled();
return true;
}

template <int32_t _M, int32_t _N>
Expand Down
Loading
Loading