Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions tests/ap/__main__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1 @@
# import test_trivial_reduce
# import test_binary_trivial_reduce
import test_matmul_binary
import test_matmul_epilogue
2 changes: 2 additions & 0 deletions tests/ap/kernel_arg_id_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ def get_or_create_kernel_arg_id_manul_var_name(self, kernel_arg_id, cpp_var_name
return self.all_kernel_arg_id2unique_name.get_or_create(kernel_arg_id, create)

def get_in_tensor_data_ptr_var_name(self, in_ir_value_name):
print('in_ir_value_name: ', in_ir_value_name)

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

去掉

ir_value = getattr(self.tensor_match_ctx, in_ir_value_name)
kernel_arg_id = self.code_gen_ctx.in_tensor_data_ptr_kernel_arg_id(ir_value)
create = self._get_creator(kernel_arg_id, self._create_in_tensor_data_ptr_var_name)
Expand All @@ -29,6 +30,7 @@ def _create_in_tensor_data_ptr_var_name(self):
return name

def get_out_tensor_data_ptr_var_name(self, out_ir_value_name):
out_ir_value_name = out_ir_value_name.replace("out", "output")

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这个replace是必须的吗?

@hxzd5568 hxzd5568 Apr 10, 2025

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

out_{i} 是程序的局部变量,output_{i} 是 context 中注册的 ir_name,这样命名可以相互区分。采用 replace 可以让局部变量找到对应的全局变量。

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这里没有解释为啥需要有"out", "output"字面量

ir_value = getattr(self.tensor_match_ctx, out_ir_value_name)
kernel_arg_id = self.code_gen_ctx.out_tensor_data_ptr_kernel_arg_id(ir_value)
create = self._get_creator(kernel_arg_id, self._create_out_tensor_data_ptr_var_name)
Expand Down
4 changes: 3 additions & 1 deletion tests/ap/make_axpr.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,13 @@ FILENAMES_ARRAY=(
"__main__"
"topo_drr_pass"
"op_convertion_drr_pass"
"umprime"
"access_topo_drr"
"abstract_drr"
"matmul_epilogue_pass_of_remove_functions"
"ap_tpl_codegen"
"matmul_binary_tpl"
"test_matmul_binary"
"test_matmul_epilogue"
)
for filename in "${FILENAMES_ARRAY[@]}"
do
Expand Down
13 changes: 11 additions & 2 deletions tests/ap/matmul/matmul.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,9 @@ struct GemmEpilogueParams {
std::vector<int64_t> input0_shape;
std::vector<int64_t> input1_shape;
std::vector<const void *> epilogue_in_ptrs;
std::vector<void *> epilogue_out_ptrs;
std::vector<std::vector<int64_t>> epilogue_in_shapes;
std::vector<std::vector<int64_t>> epilogue_out_shapes;

GemmEpilogueParams() {}
GemmEpilogueParams(cudaStream_t stream, const void *input, const void *weight,
Expand Down Expand Up @@ -156,16 +158,23 @@ struct GemmEpilogueParams {
shape_args.ldc_bias = (!bias || is_C_bias) ? 0 : n;
}

void SetEpilogues(const std::vector<const void *> &in_ptrs) {
void SetEpilogues(const std::vector<const void *> &in_ptrs,
const std::vector< void *> &out_ptrs) {
epilogue_in_ptrs = in_ptrs;
epilogue_out_ptrs = out_ptrs;
}

void
SetEpilogueAndShapes(const std::vector<const void *> &in_ptrs,
const std::vector<std::vector<int64_t>> &in_shapes) {
const std::vector<std::vector<int64_t>> &in_shapes,
const std::vector<void *> &out_ptrs,
const std::vector<std::vector<int64_t>> &out_shapes) {
ASSERT_CHECK(in_ptrs.size() == in_shapes.size());
epilogue_in_ptrs = in_ptrs;
epilogue_in_shapes = in_shapes;
ASSERT_CHECK(out_ptrs.size() == out_shapes.size());
epilogue_out_ptrs = out_ptrs;
epilogue_out_shapes = out_shapes;
}
};

Expand Down
6 changes: 4 additions & 2 deletions tests/ap/matmul/tests/matmul_binary_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,15 @@ void MatmulAddBinaryKernel(
cudaStream_t *stream, const void *input, const void *weight,
const void *bias, void *output,
const std::vector<const void *> &epilogue_ins,
const std::vector<void *> &epilogue_outs,
const std::vector<int64_t> &input_shape,
const std::vector<int64_t> &weight_shape,
const std::vector<int64_t> &bias_shape,
const std::vector<std::vector<int64_t>> &epilogue_shapes) {
const std::vector<std::vector<int64_t>> &epilogue_in_shapes,
const std::vector<std::vector<int64_t>> &epilogue_out_shapes) {
GemmEpilogueParams params(*stream, input, weight, bias, output, input_shape,
weight_shape, bias_shape);
params.SetEpilogueAndShapes(epilogue_ins, epilogue_shapes);
params.SetEpilogueAndShapes(epilogue_ins, epilogue_in_shapes, epilogue_outs, epilogue_out_shapes);

#if AP_ENABLE_AUTOTUNE
#if AP_USE_FLOAT16
Expand Down
76 changes: 61 additions & 15 deletions tests/ap/matmul_binary_tpl.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@ def is_in_tensor_karg(kernel_arg_id):
)
return kernel_arg_id_type_name == "InTensorDataPtrKernelArgId"

def is_out_tensor_karg(kernel_arg_id):
    # Tell whether `kernel_arg_id` is an output-tensor data-pointer kernel
    # argument id. The check is purely textual: the argument's type is
    # rendered as a string, the "<class '...'>" wrapper is stripped, and the
    # remainder is compared with the expected type name.
    rendered_type = f"{type(kernel_arg_id)}"
    without_prefix = rendered_type.replace("<class '", "")
    bare_type_name = without_prefix.replace("'>", "")
    return bare_type_name == "OutTensorDataPtrKernelArgId"
Comment on lines +19 to +23

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这里的代码质量不行,如果需要判断kernel_arg_id的类型,那就在c++代码里导出OutTensorDataPtrKernelArgId类变量到python层。

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

为啥要这个is_out_tensor_arg的逻辑?

@Xreki Xreki Apr 10, 2025

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这里的代码质量不行,如果需要判断kernel_arg_id的类型,那就在c++代码里导出OutTensorDataPtrKernelArgId类变量到python层。

我上个PR中对输入指针参数是这么判断的,我的锅,我来改下。

为啥要这个is_out_tensor_arg的逻辑?

因为之前为Autotune功能设计的ProfileBestConfig函数声明如下,只能接收void(const GemmEpilogueParams &)这种形式的函数。AP里面生成的Kernel函数参数列表是不固定的,需要将所有的维度、指针参数都先存到GemmEpilogueParams中,因此需要区分karg类型来保存。

static int ProfileBestConfig(
    const std::vector<std::function<void(const GemmEpilogueParams &)>>
        &gemm_functions,
    const GemmEpilogueParams &params);

最近想到ProfileBestConfig即使不生成,应该也可以支持可变参数列表,后面有空了可以再来优化下。


class MatmulBinaryTemplate:
def __init__(
Expand All @@ -39,6 +44,7 @@ def __init__(
)
self.input_dim_karg_to_shape_access = MutableOrderedDict()
self.input_tensor_karg_to_shape_access = MutableOrderedDict()
self.output_tensor_karg_to_shape_access = MutableOrderedDict()
self.kernel_name = "MatmulBinaryKernel"
self.library_name = "matmul_binary_kernel"

Expand Down Expand Up @@ -105,6 +111,11 @@ def get_kernel_arg_runtime_getters(self):
lambda pair: pair[0].runtime_getter, all_kernel_arg_id_and_unique_names
)

def init_outputs(self):
    # Build a single declaration statement of the form "T out0, out1, ...;".
    # One name is emitted for every index from 0 up to and including the
    # registry's `out_tensor_data_ptr_seq_no`.
    last_seq_no = self.mut_kernel_arg_id_registry.out_tensor_data_ptr_seq_no
    local_names = map(lambda idx: f"out{idx}", range(last_seq_no + 1))
    declared = ", ".join(local_names)
    return "T " + declared + ";"

def get_kernel_arg_types(self):
all_kernel_arg_id_and_unique_names = (
self.mut_kernel_arg_id_registry.all_kernel_arg_id2unique_name.items()
Expand Down Expand Up @@ -159,6 +170,7 @@ def get_epilogue_arguments_init_str(
def declare_epilogue_arguments_assign(pair):
kernel_arg_id = pair[0]
is_in_tensor_type = is_in_tensor_karg(kernel_arg_id)
is_out_tensor_type = is_out_tensor_karg(kernel_arg_id)

var_name = pair[1]
field_name = self.kernel_arg_translator.get_param_struct_field_name(
Expand All @@ -169,30 +181,36 @@ def get_in_tensor_statement():
param_name_for_var = self.input_tensor_karg_to_shape_access[var_name]
return f"reinterpret_cast<const {output_dtype} *>({params_name}.{param_name_for_var})"

def get_out_tensor_statement():
param_name_for_var = self.output_tensor_karg_to_shape_access[var_name]
return f"reinterpret_cast<{output_dtype} *>({params_name}.{param_name_for_var})"

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这里难道不会有性能问题吗?

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这里只是在Host端赋值一次,应该不会有太大开销,对应PR描述示例代码里面的如下部分:

  epilogue_args.in_ptr_0 = reinterpret_cast<const float *>(params.epilogue_in_ptrs[0]);
  epilogue_args.out_ptr_0 = reinterpret_cast<float *>(params.epilogue_out_ptrs[0]);


def get_dim_expr_statement():
param_name_for_var = self.input_dim_karg_to_shape_access[var_name]
return f"{params_name}.{param_name_for_var}"

statement = (
get_in_tensor_statement()
if is_in_tensor_type
else get_dim_expr_statement()
else get_out_tensor_statement()
if is_out_tensor_type
else get_dim_expr_statement()
)
return f"{obj_name}.{field_name} = {statement};"

generated_kernel_arg_id_and_names = (
self.mut_kernel_arg_id_registry.generated_kernel_arg_id2unique_name.items()
)

return f"\n{indent}".join(
map(declare_epilogue_arguments_assign, generated_kernel_arg_id_and_names)
)

def get_params_epilogue_ptrs_init_str(self, obj_name, indent):
def get_params_epilogue_ptrs_init_str(self, in_obj_name, out_obj_name, indent):
in_tensor_id = 0

def declare_params_epilogue_arguments_assign(pair):
def declare_in_params_epilogue_arguments_assign(pair):
def get_creator():
return f"{obj_name}[{in_tensor_id}]"
return f"{in_obj_name}[{in_tensor_id}]"

kernel_arg_id = pair[0]
is_in_tensor_type = is_in_tensor_karg(kernel_arg_id)
Expand All @@ -201,7 +219,7 @@ def generate_statement():
self.input_tensor_karg_to_shape_access.get_or_create(
pair[1], get_creator
)
statement = f"{obj_name}.push_back({pair[1]});"
statement = f"{in_obj_name}.push_back({pair[1]});"
in_tensor_id = in_tensor_id + 1
return statement

Expand All @@ -210,13 +228,39 @@ def generate_statement():
generated_kernel_arg_id_and_names = (
self.mut_kernel_arg_id_registry.generated_kernel_arg_id2unique_name.items()
)
return f"\n{indent}".join(
map(
declare_params_epilogue_arguments_assign,
in_str_list = map(
declare_in_params_epilogue_arguments_assign,
generated_kernel_arg_id_and_names,
)
)

out_tensor_id = 0
def declare_out_params_epilogue_arguments_assign(pair):
def get_creator():
return f"{out_obj_name}[{out_tensor_id}]"

kernel_arg_id = pair[0]
is_out_tensor_type = is_out_tensor_karg(kernel_arg_id)

def generate_statement():
self.output_tensor_karg_to_shape_access.get_or_create(
pair[1], get_creator
)
statement = f"{out_obj_name}.push_back({pair[1]});"
out_tensor_id = out_tensor_id + 1
return statement

return generate_statement() if is_out_tensor_type else ""

out_str_list = map(
declare_out_params_epilogue_arguments_assign,
generated_kernel_arg_id_and_names,
)
str_list = filter(
lambda ss: ss != "",
[*in_str_list, *out_str_list]
)
return f"\n{indent}".join(str_list)

def get_params_input_shape_init_str(self, input_name, input_shape_kargs, indent):
def init_input_shape_with_args(i):
def get_creator():
Expand Down Expand Up @@ -264,9 +308,9 @@ def make_project(
// Note: need to support vectorized operation
__forceinline__ __host__ __device__
T operator()(T x, const Arguments& args, const MatrixCoord& coord) const {
T out;
AP_OUTPUTS_INIT

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

感觉应该命名成$AP_OUTPUTS_INIT 与普通的c++宏分开。

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

原来的代码模板中对要替换的对象命名没有制定规范,可以制定一个,然后按规范来

AP_GENERATED_BINARY_EPILOGUE_STRING
return out;
return out0;

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这里生成的代码是什么样子,在PR描述里面贴一个例子吧

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

}
};

Expand Down Expand Up @@ -303,9 +347,10 @@ def make_project(
*cuda_stream_ptr, ${input0}, ${input1}, nullptr, ${output}, ${input0}_shape, ${input1}_shape, std::vector<int64_t>{});

std::vector<const void *> epilogue_in_ptrs;
std::vector<void *> epilogue_out_ptrs;
AP_PARAMS_EPILOGUE_PTRS_INIT

params.SetEpilogues(epilogue_in_ptrs);
params.SetEpilogues(epilogue_in_ptrs, epilogue_out_ptrs);

#if AP_ENABLE_AUTOTUNE
AP_AUTOTUNE_${output_dtype}(ap::RunMatmulWithVariadicKernel);
Expand All @@ -321,6 +366,7 @@ def make_project(
code_template.replace(
"AP_GENERATED_BINARY_EPILOGUE_STRING", trivial_code_str
)
.replace("AP_OUTPUTS_INIT", self.init_outputs())
.replace("AP_KERNEL_ARGS_DECLARE", self.get_kernel_arg_list_str())
.replace(
"AP_PARAMS_INPUT0_SHAPE_INIT",
Expand All @@ -336,7 +382,7 @@ def make_project(
)
.replace(
"AP_PARAMS_EPILOGUE_PTRS_INIT",
self.get_params_epilogue_ptrs_init_str("epilogue_in_ptrs", indent=" "),
self.get_params_epilogue_ptrs_init_str("epilogue_in_ptrs", "epilogue_out_ptrs", indent=" "),
)
.replace(
"AP_EPILOGUE_ARGUMENTS_FIELDS",
Expand All @@ -356,7 +402,7 @@ def make_project(
.replace("${k_value}", f"{input0_shape_kargs[-1].value}")
.replace("${n_value}", f"{input1_shape_kargs[-1].value}")
)

print('cuda code is: ', code)

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这地方的代码应该去掉

source_dir = "/work/abstract_pass/Athena/tests/ap/matmul"
cutlass_dir = "/work/abstract_pass/Athena/tests/ap/matmul/cutlass"
compile_cmd = (
Expand Down
Loading