support arbitrary input/output and non-3-dim mm_out #14
Conversation
| ir_value = getattr(self.tensor_match_ctx, in_ir_value_name) | ||
| print('ir_value: ', ir_value) | ||
| kernel_arg_id = self.code_gen_ctx.in_tensor_data_ptr_kernel_arg_id(ir_value) | ||
| print('kernel_arg_id: ', kernel_arg_id) |
| return name | ||
|
|
||
| def get_out_tensor_data_ptr_var_name(self, out_ir_value_name): | ||
| out_ir_value_name = out_ir_value_name.replace("out", "output") |
There was a problem hiding this comment.
out_{i} 是程序中的局部变量名,output_{i} 是 context 中注册的 ir_name,两种命名可以相互区分。采用 replace 可以由局部变量名找到对应的全局变量名。
| TEST_TPL_FILENAME=`echo ${TEST_FILENAME/test_/}` | ||
|
|
||
| echo "-- Write 'import ${TEST_FILENAME}' to __main__.py" | ||
| echo "import ${TEST_FILENAME}" > __main__.py |
There was a problem hiding this comment.
L3 - L9删除吧,直接把所有需要生成json的文件加到FILENAMES_ARRAY里面,pattern文件加到__main__.py里面。
| AP_OUTPUTS_INIT | ||
| AP_GENERATED_BINARY_EPILOGUE_STRING | ||
| return out; | ||
| return out0; |
| code_template.replace( | ||
| "AP_GENERATED_BINARY_EPILOGUE_STRING", trivial_code_str | ||
| ) | ||
| .replace("AP_GENERATED_ELEMENT_DTYPE", output_dtype) |
| @@ -0,0 +1,158 @@ | |||
| import access_topo_drr | |||
There was a problem hiding this comment.
可以请 @lixinqi 命名一下吗,想了几个感觉都太长了比如:matmul_epilogue_simplify_homeomorphic_subgraph
| arg_name = mut_kernel_arg_id_registry.get_in_tensor_data_ptr_var_name(data_op_name) | ||
| print('arg_name is: ', arg_name) | ||
| ptr_var_name = self.kernel_arg_translator.get_use_name(arg_name) | ||
| print('ptr_var_name is: ', ptr_var_name) |
There was a problem hiding this comment.
该log在之后策略补充中很需要,重命名为 print('ptr_var of OpLoadFromGlobal is: ', ptr_var_name)
| import pir | ||
|
|
||
| @access_topo_drr.register_drr_pass("pd_op_static_relu", tag="umprime") | ||
| class PdOpCastAccessTopoPass(access_topo_drr.DrrPass): |
| return self.all_kernel_arg_id2unique_name.get_or_create(kernel_arg_id, create) | ||
|
|
||
| def get_in_tensor_data_ptr_var_name(self, in_ir_value_name): | ||
| print('in_ir_value_name: ', in_ir_value_name) |
| return name | ||
|
|
||
| def get_out_tensor_data_ptr_var_name(self, out_ir_value_name): | ||
| out_ir_value_name = out_ir_value_name.replace("out", "output") |
| def is_out_tensor_karg(kernel_arg_id): | ||
| kernel_arg_id_type_name = f"{type(kernel_arg_id)}".replace("<class '", "").replace( | ||
| "'>", "" | ||
| ) | ||
| return kernel_arg_id_type_name == "OutTensorDataPtrKernelArgId" |
There was a problem hiding this comment.
这里的代码质量不行,如果需要判断kernel_arg_id的类型,那就在c++代码里导出OutTensorDataPtrKernelArgId类变量到python层。
There was a problem hiding this comment.
这里的代码质量不行,如果需要判断kernel_arg_id的类型,那就在c++代码里导出OutTensorDataPtrKernelArgId类变量到python层。
我上个PR中对输入指针参数是这么判断的,我的锅,我来改下。
为啥要这个is_out_tensor_arg的逻辑?
因为之前为Autotune功能设计的ProfileBestConfig函数声明如下,只能接收void(const GemmEpilogueParams &)这种形式的函数。AP里面生成的Kernel函数参数列表是不固定的,需要将所有的维度、指针参数都先存到GemmEpilogueParams中,因此需要区分karg类型来保存。
static int ProfileBestConfig(
const std::vector<std::function<void(const GemmEpilogueParams &)>>
&gemm_functions,
const GemmEpilogueParams &params);
最近想到ProfileBestConfig即使不生成,应该也可以支持可变参数列表,后面有空了可以再来优化下。
|
|
||
| def get_out_tensor_statement(): | ||
| param_name_for_var = self.output_tensor_karg_to_shape_access[var_name] | ||
| return f"reinterpret_cast<{output_dtype} *>({params_name}.{param_name_for_var})" |
There was a problem hiding this comment.
这里只是在Host端赋值一次,应该不会有太大开销,对应PR描述示例代码里面的如下部分:
epilogue_args.in_ptr_0 = reinterpret_cast<const float *>(params.epilogue_in_ptrs[0]);
epilogue_args.out_ptr_0 = reinterpret_cast<float *>(params.epilogue_out_ptrs[0]);
| __forceinline__ __host__ __device__ | ||
| T operator()(T x, const Arguments& args, const MatrixCoord& coord) const { | ||
| T out; | ||
| AP_OUTPUTS_INIT |
There was a problem hiding this comment.
感觉应该命名成 $AP_OUTPUTS_INIT,与普通的 C++ 宏区分开。
| .replace("${n_value}", f"{input1_shape_kargs[-1].value}") | ||
| ) | ||
|
|
||
| print('cuda code is: ', code) |
| @@ -0,0 +1,158 @@ | |||
| import access_topo_drr | |||
| @@ -0,0 +1,158 @@ | |||
| import access_topo_drr | |||
| mut_lir_code_gen_ctx=mut_lir_code_gen_ctx, | ||
| ) | ||
| data_op_name = inputs[0].var_name | ||
| print('data_name of OpLoadFromGlobal is: ', data_op_name) |
| def __call__(self, inputs, mut_kernel_arg_id_registry, mut_lir_code_gen_ctx): | ||
| mut_lir_code_gen_ctx.stmts.append(f"{self.get_out_var_name()} = {inputs[0].var_name};") | ||
| out_name = self.get_out_var_name() | ||
| mut_kernel_arg_id_registry.get_out_tensor_data_ptr_var_name(out_name) if out_name != "out0" else None |
There was a problem hiding this comment.
为什么总是需要特判呢?"out0"这个字面量很重要吗?为什么一定要这样?
| def __call__(self, inputs, mut_kernel_arg_id_registry, mut_lir_code_gen_ctx): | ||
| return inputs | ||
|
|
||
| class CinnOpExpandCodeGen: |
变更:
cuda 示例