diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 327f715c9b..f2dc43273f 100755 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -90,8 +90,8 @@ repos: rev: v0.6.13 hooks: - id: cmake-format -- repo: https://github.com/cmake-lint/cmake-lint - rev: 1.4.2 +- repo: https://github.com/PFCCLab/cmake-lint-paddle + rev: v1.5.1 hooks: - id: cmakelint args: [--config=./tools/codestyle/.cmakelintrc] diff --git a/backends/custom_cpu/tests/unittests/test_argsort_op.py b/backends/custom_cpu/tests/unittests/test_argsort_op.py index e84454fa3e..46e0a391c5 100644 --- a/backends/custom_cpu/tests/unittests/test_argsort_op.py +++ b/backends/custom_cpu/tests/unittests/test_argsort_op.py @@ -70,7 +70,7 @@ def forward(self): def create_tensor(np_data, place): - tensor = core.LoDTensor() + tensor = core.DenseTensor() tensor.set(np_data, place) return tensor diff --git a/backends/custom_cpu/tests/unittests/test_slice_op.py b/backends/custom_cpu/tests/unittests/test_slice_op.py index 59f5212e4a..e170baaa73 100644 --- a/backends/custom_cpu/tests/unittests/test_slice_op.py +++ b/backends/custom_cpu/tests/unittests/test_slice_op.py @@ -582,7 +582,7 @@ def test_case_1(self): main_program = base.Program() self.set_program_and_run(main_program, 1) - self.assertTrue(self.sliced_arr.type == core.VarDesc.VarType.LOD_TENSOR) + self.assertTrue(self.sliced_arr.type == core.VarDesc.VarType.DENSE_TENSOR) self.assertEqual(self.sliced_arr.shape, self.shape) self.assertTrue(np.array_equal(self.out, self.data)) self.assertTrue(np.array_equal(self.g_x0, np.ones_like(self.data))) @@ -593,7 +593,7 @@ def test_case_2(self): main_program = base.Program() self.set_program_and_run(main_program, 2) - self.assertTrue(self.sliced_arr.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY) + self.assertTrue(self.sliced_arr.type == core.VarDesc.VarType.DENSE_TENSOR_ARRAY) self.assertEqual(self.sliced_arr.shape, self.shape) self.assertTrue( np.array_equal(self.out, np.stack([self.data, self.data], axis=self.axis)) @@ -606,7 +606,7 @@ def test_case_3(self): main_program = base.Program() self.set_program_and_run(main_program, 3) - self.assertTrue(self.sliced_arr.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY) + self.assertTrue(self.sliced_arr.type == core.VarDesc.VarType.DENSE_TENSOR_ARRAY) self.assertEqual(self.sliced_arr.shape, self.shape) self.assertTrue( np.array_equal( diff --git a/backends/gcu/tests/unittests_legacy/test_argsort_op_gcu.py b/backends/gcu/tests/unittests_legacy/test_argsort_op_gcu.py index 7d1c1af987..06f305c52e 100644 --- a/backends/gcu/tests/unittests_legacy/test_argsort_op_gcu.py +++ b/backends/gcu/tests/unittests_legacy/test_argsort_op_gcu.py @@ -181,7 +181,7 @@ def forward(self): def create_tensor(np_data, place): - tensor = core.LoDTensor() + tensor = core.DenseTensor() tensor.set(np_data, place) return tensor diff --git a/backends/intel_gpu/tests/unittests/test_argsort_op.py b/backends/intel_gpu/tests/unittests/test_argsort_op.py index 78b678e930..59a4c3d93d 100644 --- a/backends/intel_gpu/tests/unittests/test_argsort_op.py +++ b/backends/intel_gpu/tests/unittests/test_argsort_op.py @@ -69,7 +69,7 @@ def forward(self): def create_tensor(np_data, place): - tensor = core.LoDTensor() + tensor = core.DenseTensor() tensor.set(np_data, place) return tensor diff --git a/backends/intel_gpu/tests/unittests/test_slice_op.py b/backends/intel_gpu/tests/unittests/test_slice_op.py index a0b0b00080..0c9ed8ab2d 100644 --- a/backends/intel_gpu/tests/unittests/test_slice_op.py +++ 
b/backends/intel_gpu/tests/unittests/test_slice_op.py @@ -582,7 +582,7 @@ def test_slice_api(self): # main_program = base.Program() # self.set_program_and_run(main_program, 1) -# self.assertTrue(self.sliced_arr.type == core.VarDesc.VarType.LOD_TENSOR) +# self.assertTrue(self.sliced_arr.type == core.VarDesc.VarType.DENSE_TENSOR) # self.assertEqual(self.sliced_arr.shape, self.shape) # self.assertTrue(np.array_equal(self.out, self.data)) # self.assertTrue(np.array_equal(self.g_x0, np.ones_like(self.data))) @@ -594,7 +594,7 @@ def test_slice_api(self): # self.set_program_and_run(main_program, 2) # self.assertTrue( -# self.sliced_arr.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY) +# self.sliced_arr.type == core.VarDesc.VarType.DENSE_TENSOR_ARRAY) # self.assertEqual(self.sliced_arr.shape, self.shape) # self.assertTrue( # np.array_equal( @@ -609,7 +609,7 @@ def test_slice_api(self): # self.set_program_and_run(main_program, 3) # self.assertTrue( -# self.sliced_arr.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY) +# self.sliced_arr.type == core.VarDesc.VarType.DENSE_TENSOR_ARRAY) # self.assertEqual(self.sliced_arr.shape, self.shape) # self.assertTrue( # np.array_equal( diff --git a/backends/npu/tests/unittests/test_slice_op_npu.py b/backends/npu/tests/unittests/test_slice_op_npu.py index 3caae3b6b8..f25b13f86b 100644 --- a/backends/npu/tests/unittests/test_slice_op_npu.py +++ b/backends/npu/tests/unittests/test_slice_op_npu.py @@ -666,7 +666,7 @@ def test_case_1(self): main_program = base.Program() self.set_program_and_run(main_program, 1) - self.assertTrue(self.sliced_arr.type == core.VarDesc.VarType.LOD_TENSOR) + self.assertTrue(self.sliced_arr.type == core.VarDesc.VarType.DENSE_TENSOR) self.assertEqual(self.sliced_arr.shape, self.shape) np.testing.assert_array_equal(self.out, self.data) np.testing.assert_array_equal(self.g_x0, np.ones_like(self.data)) @@ -677,7 +677,7 @@ def test_case_2(self): main_program = base.Program() self.set_program_and_run(main_program, 2) - self.assertTrue(self.sliced_arr.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY) + self.assertTrue(self.sliced_arr.type == core.VarDesc.VarType.DENSE_TENSOR_ARRAY) self.assertEqual(self.sliced_arr.shape, self.shape) np.testing.assert_array_equal( self.out, np.stack([self.data, self.data], axis=self.axis) @@ -690,7 +690,7 @@ def test_case_3(self): main_program = base.Program() self.set_program_and_run(main_program, 3) - self.assertTrue(self.sliced_arr.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY) + self.assertTrue(self.sliced_arr.type == core.VarDesc.VarType.DENSE_TENSOR_ARRAY) self.assertEqual(self.sliced_arr.shape, self.shape) np.testing.assert_array_equal( self.out, diff --git a/backends/npu/tests/unittests/test_strided_slice_op_npu.py b/backends/npu/tests/unittests/test_strided_slice_op_npu.py index 85e996bf70..e2e1268a26 100755 --- a/backends/npu/tests/unittests/test_strided_slice_op_npu.py +++ b/backends/npu/tests/unittests/test_strided_slice_op_npu.py @@ -689,7 +689,7 @@ def test_dygraph_op(self): # numeric_output = [self.data, self.data] # self.assertTrue( -# self.sliced_arr.type == base.core.VarDesc.VarType.LOD_TENSOR_ARRAY +# self.sliced_arr.type == base.core.VarDesc.VarType.DENSE_TENSOR_ARRAY # ) # np.testing.assert_array_equal(self.out, numeric_output) # np.testing.assert_array_equal(self.g_x0, np.zeros_like(self.data)) diff --git a/python/tests/op_test.py b/python/tests/op_test.py deleted file mode 120000 index adcdd3a239..0000000000 --- a/python/tests/op_test.py +++ /dev/null @@ -1 +0,0 @@ 
-../../Paddle/test/legacy_test/op_test.py \ No newline at end of file diff --git a/python/tests/op_test.py b/python/tests/op_test.py new file mode 100644 index 0000000000..0a88a38ca0 --- /dev/null +++ b/python/tests/op_test.py @@ -0,0 +1,3890 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import inspect +import os +import pathlib +import random +import struct +import sys +import unittest +import warnings +from collections import defaultdict +from contextlib import contextmanager +from copy import copy + +import numpy as np +from auto_parallel_op_test import ( + dump_test_info, + gen_auto_parallel_test_file, + get_subprocess_command, + get_subprocess_runtime_envs, + get_test_info_and_generated_test_path, + is_ban_auto_parallel_test, + run_subprocess, +) +from op import Operator +from prim_op_test import OpTestUtils, PrimForwardChecker, PrimGradChecker +from testsuite import append_input_output, append_loss_ops, create_op, set_input + +# Add test/legacy and test to sys.path +legacy_test_dir = pathlib.Path(__file__).parent # test/legacy_test +test_dir = legacy_test_dir.parent # test +sys.path.append(str(legacy_test_dir.absolute())) +sys.path.append(str(test_dir.absolute())) + +from utils import pir_executor_guard, static_guard +from white_list import ( + check_shape_white_list, + compile_vs_runtime_white_list, + no_check_set_white_list, + no_grad_set_white_list, + op_accuracy_white_list, + op_threshold_white_list, +) + +import paddle +from paddle import base +from paddle.autograd.ir_backward import grad as ir_grad +from paddle.base import Scope, core, unique_name +from paddle.base.backward import append_backward +from paddle.base.executor import Executor, scope_guard +from paddle.base.framework import ( + OpProtoHolder, + Program, + _current_expected_place, + canonicalize_attrs, + get_flags, + set_flags, +) +from paddle.base.wrapped_decorator import signature_safe_contextmanager + +try: + DenseTensor = core.DenseTensor + DenseTensorArray = core.DenseTensorArray + DENSE_TENSOR = core.VarDesc.VarType.DENSE_TENSOR + DENSE_TENSOR_ARRAY = core.VarDesc.VarType.DENSE_TENSOR_ARRAY +except: + DenseTensor = core.LoDTensor + DenseTensorArray = core.LoDTensorArray + DENSE_TENSOR = core.VarDesc.VarType.LOD_TENSOR + DENSE_TENSOR_ARRAY = core.VarDesc.VarType.LOD_TENSOR_ARRAY + + +@signature_safe_contextmanager +def paddle_static_guard(): + try: + paddle.enable_static() + yield + finally: + paddle.disable_static() + + +def check_out_dtype(api_fn, in_specs, expect_dtypes, target_index=0, **configs): + """ + Determines whether dtype of output tensor is as expected. + + Args: + api_fn(callable): paddle api function + in_specs(list[tuple]): list of shape and dtype information for constructing input tensor of api_fn, such as [(shape, dtype), (shape, dtype)]. + expect_dtypes(list[str]): expected dtype of output tensor. + target_index(int): indicate which one from in_specs to infer the dtype of output. 
+ config(dict): other arguments of paddle api function + + Example: + check_out_dtype(base.layers.pad_constant_like, [([2,3,2,3], 'float64'), ([1, 3, 1,3], )], ['float32', 'float64', 'int64'], target_index=1, pad_value=0.) + + """ + with paddle.pir_utils.OldIrGuard(): + for i, expect_dtype in enumerate(expect_dtypes): + with paddle.static.program_guard(paddle.static.Program()): + input_t = [] + for index, spec in enumerate(in_specs): + if len(spec) == 1: + shape = spec[0] + dtype = expect_dtype if target_index == index else "float32" + elif len(spec) == 2: + shape, dtype = spec + else: + raise ValueError( + f"Value of in_specs[{index}] should contains two elements: [shape, dtype]" + ) + input_t.append( + paddle.static.data( + name=f"data_{index}", shape=shape, dtype=dtype + ) + ) + + out = api_fn(*input_t, **configs) + out_dtype = base.data_feeder.convert_dtype(out.dtype) + + if out_dtype != expect_dtype: + raise ValueError( + f"Expected out.dtype is {expect_dtype}, but got {out_dtype} from {api_fn.__name__}." + ) + + +def _set_use_system_allocator(value=None): + USE_SYSTEM_ALLOCATOR_FLAG = "FLAGS_use_system_allocator" + old_value = core.globals()[USE_SYSTEM_ALLOCATOR_FLAG] + value = old_value if value is None else value + core.globals()[USE_SYSTEM_ALLOCATOR_FLAG] = value + return old_value + + +def randomize_probability(batch_size, class_num, dtype="float32"): + prob = np.random.uniform(0.1, 1.0, size=(batch_size, class_num)).astype(dtype) + prob_sum = prob.sum(axis=1) + for i in range(len(prob)): + prob[i] /= prob_sum[i] + return prob + + +def get_numeric_gradient( + place, + scope, + op, + inputs, + input_to_check, + output_names, + delta=0.005, + in_place=False, +): + # FIXME: change this method by compile time concepts + set_input(scope, op, inputs, place) + + def product(dim): + return functools.reduce(lambda a, b: a * b, dim, 1) + + tensor_to_check = scope.find_var(input_to_check).get_tensor() + tensor_size = product(tensor_to_check.shape()) + tensor_to_check_dtype = tensor_to_check._dtype() + if tensor_to_check_dtype == paddle.float32: + tensor_to_check_dtype = np.float32 + elif tensor_to_check_dtype == paddle.float64: + tensor_to_check_dtype = np.float64 + elif tensor_to_check_dtype == paddle.float16: + tensor_to_check_dtype = np.float16 + # set delta as np.float16, will automatic convert to float32, float64 + delta = np.array(delta).astype(np.float16) + elif tensor_to_check_dtype == paddle.bfloat16: + tensor_to_check_dtype = np.float32 + elif tensor_to_check_dtype == paddle.complex64: + tensor_to_check_dtype = np.complex64 + elif tensor_to_check_dtype == paddle.complex128: + tensor_to_check_dtype = np.complex128 + else: + raise ValueError( + "Not supported data type " + + str(tensor_to_check_dtype) + + ", tensor name : " + + str(input_to_check) + ) + + def get_output(): + sum = [] + op.run(scope, place) + for output_name in output_names: + output_numpy = np.array(scope.find_var(output_name).get_tensor()) + # numpy.dtype does not have bfloat16, thus we use numpy.uint16 to + # store bfloat16 data, and need to be converted to float to check + # the floating precision. 
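+ # Illustration (editor note): bfloat16 1.0 is stored as the uint16 bit
+ # pattern 0x3F80; convert_uint16_to_float shifts those 16 bits into the
+ # high half of a float32 (0x3F800000 == 1.0), so the mean/sum computed
+ # below operates on real float values rather than raw bit patterns.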
+ if tensor_to_check._dtype() == paddle.bfloat16: + output_numpy = convert_uint16_to_float(output_numpy) + sum.append(output_numpy.astype(tensor_to_check_dtype).mean()) + return tensor_to_check_dtype(np.array(sum).sum() / len(output_names)) + + gradient_flat = np.zeros(shape=(tensor_size,), dtype=tensor_to_check_dtype) + + def __get_elem__(tensor, i): + if tensor_to_check_dtype == np.float16: + numpy_tensor = np.array(tensor).astype(np.float16) + numpy_tensor = numpy_tensor.flatten() + return numpy_tensor[i] + elif tensor_to_check._dtype() == paddle.bfloat16: + numpy_tensor = np.array(tensor).astype(np.uint16) + numpy_tensor = numpy_tensor.flatten() + return struct.unpack( + "> 16 + + +def convert_float_to_uint16(float_list, data_format="NCHW"): + if data_format == "NHWC": + float_list = np.transpose(float_list, [0, 3, 1, 2]) + + new_output = [] + for x in np.nditer(float_list): + new_output.append(np.uint16(copy_bits_from_float_to_uint16(x))) + new_output = np.reshape(new_output, float_list.shape).view(np.uint16) + + if data_format == "NHWC": + new_output = np.transpose(new_output, [0, 2, 3, 1]) + return new_output + + +def convert_uint16_to_float(in_list): + in_list = np.asarray(in_list) + out = np.vectorize( + lambda x: struct.unpack(" 1 and is_np_data( + sub_val_value[1] + ): # case 3 + dtype_set.add(sub_val_value[1].dtype) + elif ( + len(sub_val_value) > 1 + and isinstance(sub_val_value[1], (list, tuple)) + and is_np_data(sub_val_value[1][0]) + ): # case 4 + dtype_set.add(sub_val_value[1][0].dtype) + + # infer dtype from inputs, and dtype means the precision of the test + # collect dtype of all inputs + input_dtype_set = set() + infer_dtype(inputs, input_dtype_set) + dtype_list = [ + np.dtype(np.complex128), + np.dtype(np.complex64), + np.dtype(np.float64), + np.dtype(np.float32), + np.dtype(np.float16), + np.dtype(np.int64), + np.dtype(np.int32), + np.dtype(np.uint16), + np.dtype(np.int16), + np.dtype(np.int8), + np.dtype(np.uint8), + np.dtype(np.bool_), + ] + # check the dtype in dtype_list in order, select the first dtype that in dtype_set + for dtype in dtype_list: + if dtype in input_dtype_set: + self.dtype = dtype + break + # save input dtype in class attr + self.__class__.dtype = self.dtype + + # infer dtype of outputs + output_dtype_set = set() + infer_dtype(outputs, output_dtype_set) + for dtype in dtype_list: + if dtype in output_dtype_set: + self.output_dtype = dtype + break + + def feed_var(self, input_vars, place): + feed_map = {} + for var_name in input_vars: + if isinstance(input_vars[var_name], list): + for name, np_value in self.inputs[var_name]: + tensor = DenseTensor() + if isinstance(np_value, tuple): + tensor.set(np_value[0], place) + dtype = np.array(np_value[1]).dtype + + if self.is_calc_ref: + # convert the float16 to float by numpy.astype + if dtype == np.float16: + if isinstance(np_value[1], list): + tensor.set_recursive_sequence_lengths( + np.array(np_value[1]).astype(np.float32) + ) + else: + tensor.set_recursive_sequence_lengths( + np_value[1].astype(np.float32) + ) + # convert the bfloat16 to float by convert_uint16_to_float + # provided in this file + elif dtype == np.uint16: + if isinstance(np_value[1], list): + tensor.set_recursive_sequence_lengths( + convert_uint16_to_float(np.array(np_value[1])) + ) + else: + tensor.set_recursive_sequence_lengths( + convert_uint16_to_float(np_value[1]) + ) + else: + tensor.set_recursive_sequence_lengths(np_value[1]) + else: + tensor.set_recursive_sequence_lengths(np_value[1]) + else: + if self.is_calc_ref: + if 
np_value.dtype == np.float16: + tensor.set(np_value.astype(np.float32), place) + elif np_value.dtype == np.uint16: + tensor.set(convert_uint16_to_float(np_value), place) + else: + tensor.set(np_value, place) + else: + tensor.set(np_value, place) + feed_map[name] = tensor + else: + tensor = DenseTensor() + if isinstance(self.inputs[var_name], tuple): + tensor.set(self.inputs[var_name][0], place) + if self.is_calc_ref: + if isinstance(self.inputs[var_name][1], list): + dtype = np.array(self.inputs[var_name][1]).dtype + if dtype == np.float16: + tensor.set_recursive_sequence_lengths( + np.array(self.inputs[var_name][1]).astype( + np.float32 + ) + ) + elif dtype == np.uint16: + tensor.set_recursive_sequence_lengths( + convert_uint16_to_float( + np.array(self.inputs[var_name][1]) + ) + ) + else: + tensor.set_recursive_sequence_lengths( + self.inputs[var_name][1] + ) + + elif self.inputs[var_name][1].dtype == np.float16: + tensor.set_recursive_sequence_lengths( + self.inputs[var_name][1].astype(np.float32) + ) + elif self.inputs[var_name][1].dtype == np.uint16: + tensor.set_recursive_sequence_lengths( + convert_uint16_to_float(self.inputs[var_name][1]) + ) + else: + tensor.set_recursive_sequence_lengths( + self.inputs[var_name][1] + ) + else: + tensor.set_recursive_sequence_lengths(self.inputs[var_name][1]) + else: + if self.is_calc_ref: + if self.inputs[var_name].dtype == np.float16: + tensor.set(self.inputs[var_name].astype(np.float32), place) + elif self.inputs[var_name].dtype == np.uint16: + tensor.set( + convert_uint16_to_float(self.inputs[var_name]), + place, + ) + else: + tensor.set(self.inputs[var_name], place) + else: + tensor.set(self.inputs[var_name], place) + feed_map[var_name] = tensor + + return feed_map + + def _append_ops(self, block): + self.__class__.op_type = ( + self.op_type + ) # for ci check, please not delete it for now + if self.is_mkldnn_op(): + self.__class__.use_mkldnn = True + + if self.is_xpu_op(): + self.__class__.use_xpu = True + + op_proto = OpProtoHolder.instance().get_op_proto(self.op_type) + # "infer datatype from inputs and outputs for this test case" + + if self.is_float16_op(): + self.dtype = np.float16 + self.__class__.dtype = self.dtype + self.output_dtype = np.float16 + elif self.is_bfloat16_op(): + self.dtype = np.uint16 + self.__class__.dtype = self.dtype + self.output_dtype = np.uint16 + else: + self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs) + + inputs = append_input_output( + block, op_proto, self.inputs, True, self.dtype, self.is_calc_ref + ) + outputs = append_input_output( + block, op_proto, self.outputs, False, self.dtype, self.is_calc_ref + ) + + if hasattr(self, "cache_name_list"): + for name in self.cache_name_list: + inputs[name] = block.create_var( + name=name, + persistable=True, + type=core.VarDesc.VarType.RAW, + stop_gradient=True, + ) + op = block.append_op( + type=self.op_type, + inputs=inputs, + outputs=outputs, + attrs=copy(self.attrs) if hasattr(self, "attrs") else {}, + ) + # infer variable type and infer shape in compile-time + op.desc.infer_var_type(block.desc) + op.desc.infer_shape(block.desc) + + return op + + def _get_io_vars(self, block, numpy_inputs): + inputs = {} + for name, value in numpy_inputs.items(): + if isinstance(value, list): + var_list = [block.var(sub_name) for sub_name, sub_value in value] + inputs[name] = var_list + else: + inputs[name] = block.var(name) + return inputs + + def _get_inputs(self, block): + return self._get_io_vars(block, self.inputs) + + def _get_outputs(self, block): + return 
self._get_io_vars(block, self.outputs) + + def calc_output(self, place): + outs, _ = self._calc_output(place) + return outs + + def _create_var_from_numpy(self, value): + if isinstance(value, tuple): + data = value[0] + lod = value[1] + v = paddle.to_tensor(data) + v.value().get_tensor().set_recursive_sequence_lengths(lod) + return v + else: + return paddle.to_tensor(value) + + def get_sequence_batch_size_1_input(self, lod=None, shape=None): + """Get LoD input data whose batch size is 1. + All sequence related OP unittests should call this function to contain the case of batch size = 1. + Args: + lod (list[list of int], optional): Length-based LoD, length of lod[0] should be 1. Default: [[13]]. + shape (list, optional): Shape of input, shape[0] should be equals to lod[0][0]. Default: [13, 23]. + Returns: + tuple (ndarray, lod) : LoD input data whose batch size is 1. + """ + if lod is None: + lod = [[13]] + if shape is None: + shape = [13, 23] + assert len(lod[0]) == 1 + assert lod[0][0] == shape[0] + x = np.random.uniform(0.1, 1, shape).astype("float32") + return (x, lod) + + def lod_has_single_zero(self, lod): + for i in range(len(lod) - 2): + if lod[i] != 0 and lod[i + 1] == 0 and lod[i + 2] != 0: + return True + return False + + def lod_has_continuous_zero(self, lod): + for i in range(len(lod) - 3): + if lod[i] != 0 and lod[i + 1] == 0 and lod[i + 2] == 0 and lod[i + 3] != 0: + return True + return False + + def get_sequence_instance_size_0_input(self, lod=None, shape=None): + """Get LoD input data whose instance size is 0. + All sequence related OP unittests should call this function to contain the case of instance size is 0. + Args: + lod (list[list of int], optional): Length-based LoD, lod[0]'s size must at least eight, lod[0] must at least two zeros at the beginning and at least two zeros at the end, the middle position of lod[0] contains a single zero and multiple zero. Default: [[0, 0, 4, 0, 3, 0, 0, 5, 0, 0]]. + shape (list, optional): Shape of input, shape[0] should be equals to lod[0][0]. Default: [13, 23]. + Returns: + tuple (ndarray, lod): LoD input data whose instance size is 0. 
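+ Example: x, lod = self.get_sequence_instance_size_0_input()  # with the defaults, x.shape == (12, 10) and sum(lod[0]) == 12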
+ """ + if lod is None: + lod = [[0, 0, 4, 0, 3, 0, 0, 5, 0, 0]] + if shape is None: + shape = [12, 10] + assert len(lod[0]) >= 8 + assert lod[0][0] == 0 and lod[0][1] == 0 and lod[0][-1] == 0 and lod[0][-2] == 0 + assert self.lod_has_single_zero(lod[0]) is True + assert self.lod_has_continuous_zero(lod[0]) is True + assert sum(lod[0]) == shape[0] + + x = np.random.uniform(0.1, 1, shape).astype("float32") + return (x, lod) + + def append_input_output_for_dygraph( + self, op_proto, np_list, is_input, if_return_inputs_grad_dict, block + ): + def create_var( + np_value, + name, + is_input, + if_return_inputs_grad_dict, + is_calc_ref=False, + ): + np_value_temp = np_value + has_lod = False + lod_temp = None + if isinstance(np_value, tuple): + np_value_temp = np_value[0] + has_lod = True + lod_temp = np_value[1] + + if is_input: + if self.is_calc_ref and np_value_temp.dtype == np.float16: + v = self._create_var_from_numpy(np_value_temp.astype(np.float32)) + else: + v = self._create_var_from_numpy(np_value_temp) + + if if_return_inputs_grad_dict: + v.stop_gradient = False + v.retain_grads() + + if has_lod: + v.value().get_tensor().set_recursive_sequence_lengths(lod_temp) + else: + if self.is_calc_ref and np_value_temp.dtype == np.float16: + v = block.create_var( + name=name, + dtype=np.float32, + type=DENSE_TENSOR, + persistable=False, + stop_gradient=False, + ) + else: + v = block.create_var( + name=name, + dtype=np_value_temp.dtype, + type=DENSE_TENSOR, + persistable=False, + stop_gradient=False, + ) + return v + + # prepare variable for input or output + var_dict = defaultdict(list) + if if_return_inputs_grad_dict: + inputs_grad_dict = defaultdict() + proto_list = op_proto.inputs if is_input else op_proto.outputs + for var_proto in proto_list: + name = var_proto.name + if (name not in np_list) and var_proto.dispensable: + continue + if name not in np_list: + assert var_proto.intermediate, f"{name} not found" + v = block.create_var(dtype="float32", type=DENSE_TENSOR) + var_dict[name].append(v) + if if_return_inputs_grad_dict: + inputs_grad_dict[name] = v + continue + if var_proto.duplicable: + assert isinstance( + np_list[name], list + ), f"Duplicable {name} should be set as list" + var_list = [] + slot_name = name + for name, np_value in np_list[slot_name]: + v = create_var( + np_value, + name, + is_input, + if_return_inputs_grad_dict, + self.is_calc_ref, + ) + var_list.append(v) + if if_return_inputs_grad_dict: + inputs_grad_dict[name] = v + var_dict[slot_name] = var_list + else: + nplist_value_temp = None + name_temp = None + if isinstance(np_list[name], list): + nplist_value_temp = np_list[name][0] + name_temp = name + else: + nplist_value_temp = np_list[name] + name_temp = unique_name.generate(f"{name}_out") + v = create_var( + nplist_value_temp, + name_temp, + is_input, + if_return_inputs_grad_dict, + self.is_calc_ref, + ) + var_dict[name].append(v) + if if_return_inputs_grad_dict: + inputs_grad_dict[name] = v + + if if_return_inputs_grad_dict: + return var_dict, inputs_grad_dict + else: + return var_dict + + def _check_api_outs_by_dygraph_outs(self, api_outs, dygraph_outs, place): + """for quick verify, here we take a simplest strategy: + 1. we only check variable in api_outs. + 2. we simply check the numpy (tensor) . + 3. we set atol and rtol as 1e-5, because they are unrelated to dtype. 
+ """ + for name in api_outs: + np_api = np.array(api_outs[name]) + np_dyg = np.array(dygraph_outs[name]) + assert ( + np_api.shape == np_dyg.shape + ), f"Operator ({self.op_type}) : Output ({name}) shape mismatch, expect shape is {np_dyg.shape}, but actual shape is {np_api.shape}" + np.testing.assert_allclose( + np_api, + np_dyg, + rtol=1e-05, + equal_nan=False, + err_msg="Operator (" + + self.op_type + + ") Output (" + + name + + ") has diff at " + + str(place) + + "\nExpect " + + str(np_dyg) + + "\n" + + "But Got" + + str(np_api) + + " in class " + + self.__class__.__name__, + ) + + def _calc_python_api_output(self, place, egr_inps=None, egr_oups=None): + """set egr_inps and egr_oups = None if you want to create it by yourself.""" + + def construct_output_dict_by_kernel_sig(ret_tuple, output_sig): + if hasattr(self, "python_out_sig"): + output_sig = self.python_out_sig + if not isinstance(ret_tuple, (tuple, list)): + ret_tuple = [ret_tuple] + if len(output_sig) == len(ret_tuple): + # [assumption]: we assume {"Out": [Tensor]} + return {a: [b] for a, b in zip(output_sig, ret_tuple)} + else: + # [assumption]: return multi-Tensor in a single output. such as paddle.split() + assert ( + len(output_sig) == 1 + ), "Don't support multi-output with multi-tensor output. (May be you can use set `python_out_sig`, see `test_squeeze2_op` as a example.)" + return {output_sig[0]: ret_tuple} + + def cal_python_api(python_api, args, kernel_sig): + inputs_sig, attrs_sig, outputs_sig = kernel_sig + args = OpTestUtils.assumption_assert_and_transform(args, len(inputs_sig)) + ret_tuple = python_api(*args) + result = construct_output_dict_by_kernel_sig(ret_tuple, outputs_sig) + if hasattr(self, "python_out_sig_sub_name"): + for key in self.python_out_sig_sub_name.keys(): + for i in range(len(self.python_out_sig_sub_name[key])): + result[key][0][i].name = self.python_out_sig_sub_name[key][i] + return result + + with base.dygraph.base.guard(place=place): + block = base.framework.default_main_program().global_block() + op_proto = OpProtoHolder.instance().get_op_proto(self.op_type) + # prepare input variable + dygraph_tensor_inputs = ( + egr_inps + if egr_inps + else self.append_input_output_for_dygraph( + op_proto, self.inputs, True, False, block + ) + ) + # prepare output variable + dygraph_tensor_outputs = ( + egr_oups + if egr_oups + else self.append_input_output_for_dygraph( + op_proto, self.outputs, False, False, block + ) + ) + + # prepare attributes + attrs_outputs = {} + if hasattr(self, "attrs"): + for attrs_name in self.attrs: + if self.attrs[attrs_name] is not None: + attrs_outputs[attrs_name] = self.attrs[attrs_name] + + kernel_sig = OpTestUtils._get_kernel_signature( + self.op_type, + dygraph_tensor_inputs, + dygraph_tensor_outputs, + canonicalize_attrs(attrs_outputs, op_proto), + ) + if not kernel_sig or ( + len(kernel_sig[0]) == 0 + and len(kernel_sig[1]) == 0 + and len(kernel_sig[2]) == 0 + ): + return None + if not hasattr(self, "python_api"): + print(kernel_sig) + assert hasattr( + self, "python_api" + ), f"Detect there is KernelSignature for `{self.op_type}` op, please set the `self.python_api` if you set check_dygraph = True" + args = OpTestUtils.prepare_python_api_arguments( + self.python_api, + dygraph_tensor_inputs, + attrs_outputs, + kernel_sig, + target_dtype=paddle.core.VarDesc.VarType, + ) + """ we directly return the cal_python_api value because the value is already tensor. 
+ """ + return cal_python_api(self.python_api, args, kernel_sig) + + def _calc_dygraph_output( + self, + place, + parallel=False, + no_check_set=None, + egr_inps=None, + egr_oups=None, + ): + self.__class__.op_type = ( + self.op_type + ) # for ci check, please not delete it for now + with base.dygraph.base.guard(place=place): + block = base.framework.default_main_program().global_block() + + op_proto = OpProtoHolder.instance().get_op_proto(self.op_type) + + # prepare input variable + inputs = ( + egr_inps + if egr_inps + else self.append_input_output_for_dygraph( + op_proto, self.inputs, True, False, block + ) + ) + # prepare output variable + outputs = ( + egr_oups + if egr_oups + else self.append_input_output_for_dygraph( + op_proto, self.outputs, False, False, block + ) + ) + + # prepare attributes + attrs_outputs = {} + if hasattr(self, "attrs"): + for attrs_name in self.attrs: + if self.attrs[attrs_name] is not None: + attrs_outputs[attrs_name] = self.attrs[attrs_name] + + block.append_op( + type=self.op_type, + inputs=inputs, + outputs=outputs, + attrs=attrs_outputs if hasattr(self, "attrs") else None, + ) + return outputs + + def get_kernel_signature(self, place, egr_inps=None, egr_oups=None): + with base.dygraph.base.guard(place=place): + block = base.framework.default_main_program().global_block() + op_proto = OpProtoHolder.instance().get_op_proto(self.op_type) + # prepare input variable + dygraph_tensor_inputs = ( + egr_inps + if egr_inps + else self.append_input_output_for_dygraph( + op_proto, self.inputs, True, False, block + ) + ) + # prepare output variable + dygraph_tensor_outputs = ( + egr_oups + if egr_oups + else self.append_input_output_for_dygraph( + op_proto, self.outputs, False, False, block + ) + ) + + # prepare attributes + attrs_outputs = {} + if hasattr(self, "attrs"): + for attrs_name in self.attrs: + if self.attrs[attrs_name] is not None: + attrs_outputs[attrs_name] = self.attrs[attrs_name] + kernel_sig = OpTestUtils._get_kernel_signature( + self.op_type, + dygraph_tensor_inputs, + dygraph_tensor_outputs, + canonicalize_attrs(attrs_outputs, op_proto), + ) + if not kernel_sig or ( + len(kernel_sig[0]) == 0 + and len(kernel_sig[1]) == 0 + and len(kernel_sig[2]) == 0 + ): + return None + if not hasattr(self, "python_api"): + print(kernel_sig) + assert hasattr( + self, "python_api" + ), f"Detect there is KernelSignature for `{self.op_type}` op, please set the `self.python_api` if you set check_dygraph = True" + return kernel_sig + + def get_ir_input_attr_dict_and_feed(self, stop_gradient): + attrs_outputs = {} + if hasattr(self, "attrs"): + for attrs_name in self.attrs: + if self.attrs[attrs_name] is not None: + attrs_outputs[attrs_name] = self.attrs[attrs_name] + input_dict = {} + static_inputs = defaultdict(list) + feed = {} + for name, item in self.inputs.items(): + if isinstance(item, (list, tuple)): + for tup in item: + dtype = ( + "bfloat16" + if OpTestUtils.is_bfloat16_type(tup[1].dtype) + else tup[1].dtype + ) + x = paddle.static.data( + name=str(tup[0]), shape=tup[1].shape, dtype=dtype + ) + x.stop_gradient = stop_gradient + static_inputs[name].append(x) + feed.update({str(tup[0]): tup[1]}) + input_dict.update({str(tup[0]): x}) + else: + dtype = ( + "bfloat16" + if OpTestUtils.is_bfloat16_type(item.dtype) + else item.dtype + ) + x = paddle.static.data(name=name, shape=item.shape, dtype=dtype) + x.stop_gradient = stop_gradient + static_inputs[name].append(x) + feed.update({name: item}) + input_dict.update({name: x}) + return static_inputs, attrs_outputs, 
input_dict, feed + + def _need_fetch(self, sig_name): + if sig_name in self.outputs: + return True + for _, value in self.outputs.items(): + if not isinstance(value, (tuple, list)): + continue + for var_name, _ in value: + if sig_name == var_name: + return True + return False + + def _calc_pir_output(self, place, no_check_set=None, inps=None, oups=None): + """set egr_inps and egr_oups = None if you want to create it by yourself.""" + + def construct_output_dict_by_kernel_sig(ret_tuple, output_sig): + if hasattr(self, "python_out_sig"): + output_sig = self.python_out_sig + if not isinstance(ret_tuple, (tuple, list)): + ret_tuple = [ret_tuple] + if len(output_sig) == len(ret_tuple): + # [assumption]: we assume {"Out": [Tensor]} + return {a: [b] for a, b in zip(output_sig, ret_tuple)} + else: + # [assumption]: return multi-Tensor in a single output. such as paddle.split() + assert ( + len(output_sig) == 1 + ), "Don't support multi-output with multi-tensor output. (May be you can use set `python_out_sig`, see `test_squeeze2_op` as a example.)" + return {output_sig[0]: ret_tuple} + + # get kernel signature + kernel_sig = self.get_kernel_signature(place) + ir_program = paddle.static.Program() + with paddle.static.program_guard(ir_program): + with scope_guard(Scope()): + # prepare inps attributes feed + ( + static_inputs, + attrs, + input_dict, + feed, + ) = self.get_ir_input_attr_dict_and_feed(stop_gradient=True) + # prepare args + args = OpTestUtils.prepare_python_api_arguments( + self.python_api, + static_inputs, + attrs, + kernel_sig, + target_dtype=paddle.pir.core.DataType, + ) + inputs_sig, attrs_sig, outputs_sig = kernel_sig + if hasattr(self, "python_out_sig"): + outputs_sig = self.python_out_sig + args = OpTestUtils.assumption_assert_and_transform( + args, len(inputs_sig) + ) + ret_tuple = self.python_api(*args) + fetch_list = getattr(self, "fetch_list", []) + # if the fetch_list is customized by user, we use it directly. + # if not, fill the fetch_list by the user configured outputs in test. 
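+ # Results that appear in no_check_set, or whose kernel-signature name is
+ # not declared in self.outputs (see _need_fetch), are skipped below.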
+ # filter ret_tuple + ret_to_check = [] + if len(fetch_list) == 0: + if isinstance(ret_tuple, (tuple, list)): + assert len(ret_tuple) == len(outputs_sig) + for var, sig_name in zip(ret_tuple, outputs_sig): + if no_check_set is not None and var in no_check_set: + continue + if not self._need_fetch(sig_name): + continue + if isinstance(var, list): + ret_to_check.append(var) + for v in var: + fetch_list.append(v) + else: + ret_to_check.append(var) + fetch_list.append(var) + elif isinstance(ret_tuple, paddle.base.libpaddle.pir.Value): + fetch_list.append(ret_tuple) + ret_to_check = ret_tuple + elif ret_tuple is None: + pass + else: + raise ValueError( + "output of python api should be Value or list of Value or tuple of Value" + ) + + # executor run + executor = Executor(place) + outs = executor.run(ir_program, feed=feed, fetch_list=[fetch_list]) + outputs_sig = [ + sig_name for sig_name in outputs_sig if self._need_fetch(sig_name) + ] + + if paddle.utils.is_sequence(ret_to_check) and paddle.utils.is_sequence( + outs + ): + outs = paddle.utils.pack_sequence_as(ret_to_check, outs) + + result = construct_output_dict_by_kernel_sig(outs, outputs_sig) + if hasattr(self, "python_out_sig_sub_name"): + for key in self.python_out_sig_sub_name.keys(): + result[key][0] = { + a: [b] + for a, b in zip( + self.python_out_sig_sub_name[key], + result[key][0], + ) + } + return result + + def _check_ir_output(self, place, program, feed_map, fetch_list, outs): + if os.getenv("FLAGS_PIR_OPTEST") is None: + return + if os.getenv("FLAGS_PIR_OPTEST_WHITE_LIST") is None: + return + if self.check_prim or self.check_prim_pir: + return + if self._check_cinn: + return + stored_flag = get_flags( + [ + "FLAGS_enable_pir_in_executor", + "FLAGS_pir_apply_inplace_pass", + ] + ) + try: + set_flags( + { + "FLAGS_enable_pir_in_executor": True, + "FLAGS_pir_apply_inplace_pass": 0, + } + ) + new_scope = paddle.static.Scope() + executor = Executor(place) + new_program = None + if isinstance(program, paddle.static.CompiledProgram): + new_program = base.CompiledProgram( + program._program, build_strategy=program._build_strategy + ) + else: + new_program = program.clone() + ir_outs = executor.run( + new_program, + feed=feed_map, + fetch_list=fetch_list, + return_numpy=False, + scope=new_scope, + ) + assert len(outs) == len( + ir_outs + ), "Fetch result should have same length when executed in pir" + + check_method = np.testing.assert_array_equal + if os.getenv("FLAGS_PIR_OPTEST_RELAX_CHECK", None) == "True": + + def relaxed_check(x, y, err_msg=""): + np.testing.assert_allclose( + x, y, err_msg=err_msg, atol=1e-6, rtol=1e-6 + ) + + check_method = relaxed_check + if os.getenv("FLAGS_PIR_NO_CHECK", None) == "True": + check_method = lambda x, y, err_msg: None + + for i in range(len(outs)): + check_method( + outs[i], + ir_outs[i], + err_msg="Operator Check (" + + self.op_type + + ") has diff at " + + str(place) + + "\nExpect " + + str(outs[i]) + + "\n" + + "But Got" + + str(ir_outs[i]) + + " in class " + + self.__class__.__name__, + ) + finally: + set_flags(stored_flag) + + def _calc_output( + self, + place, + parallel=False, + no_check_set=None, + loss=None, + enable_inplace=None, + for_inplace_test=None, + check_cinn=False, + ): + with paddle.pir_utils.OldIrGuard(): + if hasattr(self, "attrs"): + for k, v in self.attrs.items(): + if isinstance(v, paddle.base.core.DataType): + self.attrs[k] = paddle.pir.core.datatype_to_vartype[v] + program = Program() + block = program.global_block() + op = self._append_ops(block) + + inputs = 
self._get_inputs(block) + outputs = self._get_outputs(block) + feed_map = self.feed_var(inputs, place) + + if for_inplace_test: + # Some variables' tensors hold no buffer (tensor's _holder is NULL), like XShape in reshape2 op, + # and the shapes of those variables contain 0 (eg. Xshape.shape = [0, 2, 5]). + # Set persistable for those variables in order to get them from global_scope for inplace grad test directly other than feed them, + # since feed op calls check_memory_size() which fails when tensor's holder_ is NULL. + for out_name in op.output_arg_names: + var = block.var(out_name) + if 0 in var.shape: + var.persistable = True + original_program = program + if parallel: + use_cuda = False + if isinstance(place, base.CUDAPlace): + use_cuda = True + compiled_prog = base.CompiledProgram(program) + program = compiled_prog + fetch_list = getattr(self, "fetch_list", []) + # if the fetch_list is customized by user, we use it directly. + # if not, fill the fetch_list by the user configured outputs in test. + if len(fetch_list) == 0: + for var_name, var in outputs.items(): + if no_check_set is not None and var_name in no_check_set: + continue + if isinstance(var, list): + for v in var: + fetch_list.append(v.name) + else: + fetch_list.append(var.name) + # if the fetch_list still empty, fill the fetch_list by the operator output. + if len(fetch_list) == 0: + for out_name, out_dup in Operator.get_op_outputs(self.op_type): + fetch_list.append(str(out_name)) + + enable_cinn_test = check_cinn and self._enable_check_cinn_test( + place, feed_map, outputs + ) + if enable_cinn_test: + if hasattr(self, "cinn_atol"): + self.atol = self.cinn_atol + if hasattr(self, "cinn_rtol"): + self.rtol = self.cinn_rtol + + if (enable_inplace is not None) or enable_cinn_test: + build_strategy = base.BuildStrategy() + if enable_inplace is not None: + build_strategy.enable_inplace = enable_inplace + if enable_cinn_test: + build_strategy.build_cinn_pass = check_cinn + self._check_cinn = enable_cinn_test + + compiled_prog = base.CompiledProgram( + program, build_strategy=build_strategy + ) + program = compiled_prog + + executor = Executor(place) + + outs = executor.run( + program, + feed=feed_map, + fetch_list=fetch_list, + return_numpy=False, + ) + + self._check_ir_output(place, program, feed_map, fetch_list, outs) + + self.op = op + self.program = original_program + if for_inplace_test: + return outs, fetch_list, feed_map, original_program, op.desc + else: + return outs, fetch_list + + def _compare_symbol(self, program, outs): + i = 0 + # check that all ops have defined the InferSymbolicShapeInterface + if paddle.base.libpaddle.pir.all_ops_defined_symbol_infer(program): + # compare expect & actual + shape_analysis = paddle.base.libpaddle.pir.get_shape_constraint_ir_analysis( + program + ) + for block in program.blocks: + for op in block.ops: + if op.name() == "pd_op.fetch": + for j, var in enumerate(op.results()): + if var.is_dense_tensor_type() or var.is_selected_row_type(): + shape_or_data = ( + shape_analysis.get_shape_or_data_for_var(var) + ) + expect_shape = outs[i].shape + i += 1 + expect_data = [] + if not shape_or_data.is_equal( + expect_shape, expect_data + ): + raise AssertionError( + f"The shape or data of Operator {self.op_type}'s result_value[{j}] is different from expected." 
+ ) + else: + # TODO(gongshaotian): raise error + pass + + def _infer_and_compare_symbol(self, place): + """Don't caculate the program, only infer the shape of var""" + + kernel_sig = self.get_kernel_signature(place) + program = paddle.static.Program() + with paddle.static.program_guard(program): + with scope_guard(Scope()): + # prepare inps attributes feed + ( + static_inputs, + attrs, + input_dict, + feed, + ) = self.get_ir_input_attr_dict_and_feed(stop_gradient=True) + # prepare args + args = OpTestUtils.prepare_python_api_arguments( + self.python_api, + static_inputs, + attrs, + kernel_sig, + target_dtype=paddle.pir.core.DataType, + ) + inputs_sig, attrs_sig, outputs_sig = kernel_sig + if hasattr(self, "python_out_sig"): + outputs_sig = self.python_out_sig + args = OpTestUtils.assumption_assert_and_transform( + args, len(inputs_sig) + ) + # add op to program + ret_tuple = self.python_api(*args) + fetch_list = getattr(self, "fetch_list", []) + # if the fetch_list is customized by user, we use it directly. + # if not, fill the fetch_list by the user configured outputs in test. + # filter ret_tuple + ret_to_check = [] + if len(fetch_list) == 0: + if isinstance(ret_tuple, (tuple, list)): + assert len(ret_tuple) == len(outputs_sig) + for var, sig_name in zip(ret_tuple, outputs_sig): + if not self._need_fetch(sig_name): + continue + if isinstance(var, list): + ret_to_check.append(var) + for v in var: + fetch_list.append(v) + else: + ret_to_check.append(var) + fetch_list.append(var) + elif isinstance(ret_tuple, paddle.base.libpaddle.pir.Value): + fetch_list.append(ret_tuple) + ret_to_check = ret_tuple + elif ret_tuple is None: + pass + else: + raise ValueError( + "output of python api should be Value or list of Value or tuple of Value" + ) + + # executor run + executor = Executor(place) + outs = executor.run(program, feed=feed, fetch_list=[fetch_list]) + + self._compare_symbol(program, outs) + + def _compare_expect_and_actual_outputs( + self, place, fetch_list, expect_outs, actual_outs, inplace_atol=None + ): + """Compare expect outs and actual outs of an tested op. + + Args: + place (CPUPlace | CUDAPlace): The place where the op runs. + fetch_list (list): The outputs of tested op. + expect_outs (list): The expect outs of tested op. + actual_outs (list): The actual outs of tested op. + inplace_atol (float): The tolerable error, only set when tested op doesn't ensure computational consistency, like group_norm op. + + Returns: + None. + """ + # compare expect_outs and actual_outs + for i, name in enumerate(fetch_list): + # Note(zhiqiu): inplace_atol should be only set when op doesn't ensure + # computational consistency. + # When inplace_atol is not None, the inplace check uses numpy.allclose + # to check inplace result instead of numpy.array_equal. 
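+ # The allclose path below uses rtol=1e-03 for uint16 (bfloat16) outputs,
+ # rtol=1e-05 otherwise, with atol=inplace_atol.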
+ expect_out = np.array(expect_outs[i]) + actual_out = np.array(actual_outs[i]) + assert ( + actual_out.shape == expect_out.shape + ), f"Operator ({self.op_type}) : Output ({name}) shape mismatch, expect shape is {expect_out.shape}, but actual shape is {actual_out.shape}" + if inplace_atol is not None: + np.testing.assert_allclose( + expect_out, + actual_out, + rtol=1e-03 if self.dtype == np.uint16 else 1e-5, + atol=inplace_atol, + err_msg="Operator (" + + self.op_type + + ") Output (" + + name + + ") has diff at " + + str(place) + + " when using and not using inplace" + + "\nExpect " + + str(expect_out) + + "\n" + + "But Got" + + str(actual_out) + + " in class " + + self.__class__.__name__, + ) + else: + np.testing.assert_array_equal( + expect_out, + actual_out, + err_msg="Output (" + + name + + ") has diff at " + + str(place) + + " when using and not using inplace" + + "\nExpect " + + str(expect_out) + + "\n" + + "But Got" + + str(actual_out) + + " in class " + + self.__class__.__name__ + + "\n", + ) + + def _construct_grad_program_from_forward( + self, fwd_program, grad_op_desc, op_grad_to_var + ): + """Generate grad_program which contains the grad_op. + + Args: + fwd_program (tuple): The program that contains grad_op_desc's corresponding forward op. + grad_op_desc (OpDesc): The OpDesc of grad op. + op_grad_to_var (dict): The relation of variables in grad op and its forward op. + + Returns: + grad_program (program): The program which contains the grad_op. + """ + with paddle.pir_utils.OldIrGuard(): + grad_program = Program() + grad_block = grad_program.global_block() + new_op_desc = grad_block.desc.append_op() + new_op_desc.copy_from(grad_op_desc) + grad_program._sync_with_cpp() + + # Create grad vars based on fwd vars (shape and dtype) + for arg in grad_op_desc.input_arg_names() + grad_op_desc.output_arg_names(): + fwd_var_name = op_grad_to_var.get(arg, None) + if fwd_var_name is None: + fwd_var_name = arg + fwd_var = fwd_program.global_block().vars.get(fwd_var_name) + assert fwd_var is not None, f"{fwd_var_name} cannot be found" + grad_var = grad_block.create_var( + name=arg, + dtype=fwd_var.dtype, + shape=fwd_var.shape, + type=fwd_var.type, + persistable=False, + ) + + # Some variables' tensors hold no buffer (tensor's _holder is NULL), like XShape in reshape2 op, + # and the shapes of those variables contain 0 (eg. Xshape.shape = [0, 2, 5]). + # Set persistable for those variables in order to get them from global_scope for inplace grad test directly other than feed them, + # since feed op calls check_memory_size() which fails when tensor's holder_ is NULL. + if 0 in grad_var.shape: + grad_var.persistable = True + grad_program._sync_with_cpp() + return grad_program + + def _construct_grad_feed_map_from_forward( + self, place, fwd_res, grad_op_desc, op_grad_to_var + ): + """Generate grad_feed_map for grad_program. + + since we don`t really check gradient accuracy, but check the consistency when using and not using inplace, + we use fwd outs (also inputs sometimes) to construct grad inputs. + + Args: + place (CPUPlace | CUDAPlace): The place where the op runs. + fwd_res (tuple): The outputs of its forward op, in the same form as returns of _calc_outputs() when for_inplace_test is True. + i.e., tuple(fwd_outs, fwd_fetch_list, fwd_feed_map, fwd_program, fwd_op_desc) + grad_op_desc (OpDesc): The OpDesc of grad op. + op_grad_to_var (dict): The relation of variables in grad op and its fwd_op. + + Returns: + grad_feed_map (dict): The feed_map of grad_op. 
+ """ + ( + fwd_outs, + fwd_fetch_list, + fwd_feed_map, + fwd_program, + fwd_op_desc, + ) = fwd_res + p = core.Place() + p.set_place(place) + grad_feed_map = {} + for arg in grad_op_desc.input_arg_names(): + if arg in fwd_feed_map.keys(): + grad_feed_map[arg] = fwd_feed_map[arg]._copy(p) + else: + fwd_var_name = op_grad_to_var.get(arg, None) + if fwd_var_name is None: + fwd_var_name = arg + + for i, out_name in enumerate(fwd_fetch_list): + if out_name == fwd_var_name: + # don't feed variables whose tensors hold no buffer (shape contains 0 like shape = [0,2,5] and holder_ is NULL), like XShape in reshape2 op. + # get them from global_scope directly since we have set them persistable in fwd execution + if 0 in fwd_program.global_block().var(out_name).shape: + continue + else: + grad_feed_map[arg] = fwd_outs[i]._copy(p) + + return grad_feed_map + + def _get_need_run_ops(self, op_desc, fwd_op_desc=None): + """Postorder traversal of the 'grad' tree to get all ops that need to run during inplace test. + An op needs to run during inplace check if, + (1) it has infer_inplace, + (2) it has infer_inplace in its grad descendants. (since we need its outputs as to construct its grad's inputs) + + Args: + op_desc (OpDesc): The op_desc of current op. + fwd_op_desc (OpDesc): The op_desc of current op's forward op, None if current op has no forward op. + E.g. relu's fwd_op is None, relu_grad's fwd_op is relu, relu_grad_grad's fwd_op is relu_grad, etc. + + Returns: + need_run_ops (list[(op_desc, fwd_op_desc)]): The ops that need to run during inplace test. + """ + need_run_ops = [] + visited_ops = [] + + def _dfs_grad_op(op_desc, fwd_op_desc=None): + visited_ops.append(op_desc.type()) + has_infer_inplace = base.core.has_infer_inplace(op_desc.type()) + has_grad_op_maker = base.core.has_grad_op_maker(op_desc.type()) + has_infer_inplace_in_grad_descendants = False + if not has_grad_op_maker: + has_infer_inplace_in_descendants = False + else: + # get grad_op_desc + grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc( + op_desc, set(), [] + ) + if not grad_op_desc_list: + has_infer_inplace_in_grad_descendants = False + else: + for i, grad_op_desc in enumerate(grad_op_desc_list): + if grad_op_desc.type() not in visited_ops and _dfs_grad_op( + grad_op_desc, fwd_op_desc=op_desc + ): + has_infer_inplace_in_grad_descendants = True + if has_infer_inplace or has_infer_inplace_in_grad_descendants: + need_run_ops.append((op_desc, fwd_op_desc)) + return True + else: + return False + + _dfs_grad_op(op_desc, fwd_op_desc=fwd_op_desc) + return need_run_ops + + def _check_forward_inplace(self, place, no_check_set=None, inplace_atol=None): + """Check the inplace correctness of given op (self.op_type). + Run the op twice with same inputs, one enable inplace and another disable, compare their outputs. + + Args: + place (CPUPlace | CUDAPlace): The place where the op runs. + no_check_set (list): The names of outputs that needn't check, like XShape of reshape op. + inplace_atol (float): The tolerable error, only set when op doesn't ensure computational consistency, like group_norm op. + + Returns: + expect_res (tuple(outs, fetch_list, feed_map, program, op_desc)): The results of given op. + We return this to construct grad_program and grad_feed_map for grad inplace check. + """ + # _calc_output() returns in the form tuple(outs, fetch_list, feed_map, program, op_desc) when for_inplace_test=True. 
+ expect_res = self._calc_output( + place, + no_check_set=no_check_set, + enable_inplace=False, + for_inplace_test=True, + ) + actual_res = self._calc_output( + place, + no_check_set=no_check_set, + enable_inplace=True, + for_inplace_test=True, + ) + # compare expect_outs and actual_outs + self._compare_expect_and_actual_outputs( + place, + expect_res[1], + expect_res[0], + actual_res[0], + inplace_atol=inplace_atol, + ) + return expect_res + + def _calc_grad_output(self, place, fwd_res, grad_op_desc, enable_inplace=None): + """Calculate grad_output for given grad_op_desc. + + since we don`t really check gradient accuracy, but check the consistency when using and not using inplace, + we use fwd outs (also inputs sometimes) to construct grad inputs. + + Args: + place (CPUPlace | CUDAPlace): The place where the op runs. + fwd_res (tuple): The outputs of its forward op, in the same form as returns of _calc_outputs() when for_inplace_test is True. + i.e., tuple(fwd_outs, fwd_fetch_list, fwd_feed_map, fwd_program, fwd_op_desc). + grad_op_desc (OpDesc): The OpDesc of grad op. + enable_inplace (bool): Enable inplace or not. + + Returns: + res (tuple(outs, fetch_list, feed_map, program, op_desc)): The results of given grad_op_desc. + """ + with static_guard(): + ( + fwd_outs, + fwd_fetch_list, + fwd_feed_map, + fwd_program, + fwd_op_desc, + ) = fwd_res + grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc( + fwd_op_desc, set(), [] + ) + grad_program = self._construct_grad_program_from_forward( + fwd_program, grad_op_desc, op_grad_to_var + ) + grad_feed_map = self._construct_grad_feed_map_from_forward( + place, fwd_res, grad_op_desc, op_grad_to_var + ) + grad_fetch_list = grad_op_desc.output_arg_names() + exe = Executor(place) + program = grad_program + if enable_inplace is not None: + build_strategy = base.BuildStrategy() + build_strategy.enable_inplace = enable_inplace + compiled_program = base.CompiledProgram( + grad_program, build_strategy=build_strategy + ) + program = compiled_program + + outs = exe.run( + program, + feed=grad_feed_map, + fetch_list=grad_fetch_list, + return_numpy=False, + ) + return outs, grad_fetch_list, grad_feed_map, grad_program, grad_op_desc + + def _check_grad_inplace(self, place, fwd_res, grad_op_desc, inplace_atol=None): + """Check the inplace correctness of given grad_op_desc. + + Run the grad op twice with same inputs, one enable inplace and another disable, compare their outputs. + It works like _check_forward_inplace, but the way to construct program and feed_map differs. + So we define a new function for grad, grad_grad, etc. + + Args: + place (CPUPlace | CUDAPlace): The place where the op runs. + fwd_res (tuple): The outputs of its forward op, in the same form as returns of _calc_outputs() when for_inplace_test is True. + i.e., tuple(fwd_outs, fwd_fetch_list, fwd_feed_map, fwd_program, fwd_op_desc). + grad_op_desc (OpDesc): The OpDesc of grad op. + inplace_atol (float): The tolerable error, only set when op doesn't ensure computational consistency, like group_norm op. + + Returns: + expect_res (tuple(outs, fetch_list, feed_map, program, op_desc)): The results of given op. + We return this to construct grad_program and grad_feed_map for grad inplace check. 
+ """ + expect_res = self._calc_grad_output( + place, fwd_res, grad_op_desc, enable_inplace=False + ) + actual_res = self._calc_grad_output( + place, fwd_res, grad_op_desc, enable_inplace=True + ) + + self._compare_expect_and_actual_outputs( + place, + expect_res[1], + expect_res[0], + actual_res[0], + inplace_atol=inplace_atol, + ) + return expect_res + + def check_inplace_output_with_place( + self, place, no_check_set=None, inplace_atol=None + ): + """Check the inplace correctness of given op, its grad op, its grad_grad op, etc. + + (1) Get all ops need to run. (see conditions in _get_need_run_ops()) + (2) Run op in need_run_ops, and do inplace check if it has infer_inplace. + + Args: + place (CPUPlace | CUDAPlace): The place where the op runs. + no_check_set (list): The names of outputs that needn't check, like XShape of reshape op. + inplace_atol (float): The tolerable error, only set when op doesn't ensure computational consistency, like group_norm op. + + Returns: + None + """ + if getattr(self, "no_need_check_inplace", False): + return + + if ( + os.getenv("FLAGS_enable_pir_in_executor") + or os.getenv("FLAGS_enable_pir_api") + or get_flags("FLAGS_enable_pir_in_executor")["FLAGS_enable_pir_in_executor"] + or get_flags("FLAGS_enable_pir_api")["FLAGS_enable_pir_api"] + ): + return + + has_infer_inplace = base.core.has_infer_inplace(self.op_type) + has_grad_op_maker = base.core.has_grad_op_maker(self.op_type) + fwd_res = self._calc_output( + place, no_check_set=no_check_set, for_inplace_test=True + ) + op_desc = fwd_res[4] + need_run_ops = self._get_need_run_ops(op_desc) + + res = {} + if hasattr(self, "attrs") and bool(self.attrs.get("use_xpu", False)): + return + for op_desc, father_op_desc in reversed(need_run_ops): + # The first one is the forward op + has_infer_inplace = base.core.has_infer_inplace(op_desc.type()) + if op_desc.type() == self.op_type: + if has_infer_inplace: + res[op_desc] = self._check_forward_inplace( + place, + no_check_set=no_check_set, + inplace_atol=inplace_atol, + ) + else: + res[op_desc] = self._calc_output( + place, no_check_set=no_check_set, for_inplace_test=True + ) + else: + # TODO(zhiqiu): enhance inplace_grad test for ops (sum and activation) using mkldnn + # skip op that use_mkldnn currently + flags_use_mkldnn = base.core.globals()["FLAGS_use_mkldnn"] + attrs_use_mkldnn = hasattr(self, "attrs") and bool( + self.attrs.get("use_mkldnn", False) + ) + if flags_use_mkldnn or attrs_use_mkldnn: + warnings.warn( + "check inplace_grad for ops using mkldnn is not supported" + ) + continue + if has_infer_inplace: + fwd_res = res[father_op_desc] + res[op_desc] = self._check_grad_inplace( + place, fwd_res, op_desc, inplace_atol=inplace_atol + ) + else: + res[op_desc] = self._calc_grad_output(place, fwd_res, op_desc) + + def check_output_with_place( + self, + place, + atol=0, + rtol=0, + no_check_set=None, + equal_nan=False, + check_dygraph=True, + check_prim=False, + check_prim_pir=False, + only_check_prim=False, + inplace_atol=None, + check_cinn=False, + check_pir=False, + check_auto_parallel=False, + check_pir_onednn=False, + check_symbol_infer=True, + ): + core._set_prim_all_enabled(False) + core.set_prim_eager_enabled(False) + if not self.is_mkldnn_op(): + set_flags({"FLAGS_use_mkldnn": False}) + + if hasattr(self, "use_custom_device") and self.use_custom_device: + check_dygraph = False + + def find_imperative_actual(target_name, dygraph_outs, place): + for name in dygraph_outs: + if name == target_name: + return dygraph_outs[name][0] + var_list = 
dygraph_outs[name]
+                for i, var in enumerate(var_list):
+                    if isinstance(var, list):
+                        for tensor in var:
+                            if tensor.name == target_name:
+                                return tensor
+                    elif isinstance(var, paddle.Tensor) and var.name == target_name:
+                        return dygraph_outs[name][i]
+            self.assertTrue(
+                False,
+                f"Found failed {dygraph_outs.keys()} {target_name}",
+            )
+
+        def find_imperative_expect(target_name, dygraph_outs, place):
+            for name in dygraph_outs:
+                if name == target_name:
+                    return dygraph_outs[name][0]
+                var_list = dygraph_outs[name]
+                for i, var in enumerate(var_list):
+                    if var.name == target_name:
+                        return dygraph_outs[name][i]
+            self.assertTrue(
+                False,
+                f"Found failed {dygraph_outs.keys()} {target_name}",
+            )
+
+        def find_actual(target_name, fetch_list):
+            found = [
+                i for i, var_name in enumerate(fetch_list) if var_name == target_name
+            ]
+            self.assertTrue(len(found) == 1, f"Found {len(found)} {target_name}")
+            return found[0]
+
+        def find_expect(target_name, fetch_list):
+            found = [
+                i for i, var_name in enumerate(fetch_list) if var_name == target_name
+            ]
+            self.assertTrue(len(found) == 1, f"Found {len(found)} {target_name}")
+            return found[0]
+
+        class Checker:
+            """Base class for checks against self.outputs.
+            Currently doesn't support checks between different checkers.
+            """
+
+            def __init__(self, op_test, expect_dict):
+                """expect_dict is the self.outputs,
+                supporting {str: [numpy]} and {str: [(str, numpy), (str, numpy)]}
+                """
+                self.expects = expect_dict
+                self.checker_name = "checker"
+                self.op_test = op_test  # store the op_test object.
+                self.op_type = op_test.op_type
+
+            def init(self):
+                pass
+
+            def convert_uint16_to_float(self, actual_np, expect_np):
+                raise NotImplementedError("base class, not implement!")
+
+            def calculate_output(self):
+                """
+                Judge whether to convert the current output and expect to uint16.
+ return True | False + """ + + def _is_skip_name(self, name): + if name not in self.expects: + return True + if no_check_set is not None and name in no_check_set: + return True + return False + + def find_actual_value(self, name): + """return: (actual_tensor(var_base), actual_numpy)""" + raise NotImplementedError("base class, not implement!") + + def find_expect_value(self, name): + """return: (expect_tensor(var_base), actual_numpy)""" + raise NotImplementedError("base class, not implement!") + + def _compare_numpy(self, name, actual_np, expect_np): + expect_np = np.array(expect_np) + assert ( + actual_np.shape == expect_np.shape + ), f"Operator ({self.op_type}) : Output ({name}) shape mismatch, expect shape is {expect_np.shape}, but actual shape is {actual_np.shape}" + np.testing.assert_allclose( + actual_np, + expect_np, + atol=self.atol if hasattr(self, "atol") else atol, + rtol=self.rtol if hasattr(self, "rtol") else rtol, + equal_nan=equal_nan, + err_msg=( + "Operator (" + + self.op_type + + ") Output (" + + name + + ") has diff at " + + str(place) + + " in " + + self.checker_name + ), + ) + + def compare_single_output_with_expect(self, name, expect): + actual, actual_np = self.find_actual_value(name) + # expect_np = expect[0] if isinstance(expect, tuple) else expect + if self.op_test.is_compared_with_fp32(): + expect, expect_np = self.find_expect_value(name) + else: + expect_np = ( + expect[0] if isinstance(expect, (tuple, list)) else expect + ) + actual_np, expect_np = self.convert_uint16_to_float_ifneed( + actual_np, expect_np + ) + # modify there for fp32 check + self._compare_numpy(name, actual_np, expect_np) + + def compare_outputs_with_expects(self): + for out_name, out_dup in Operator.get_op_outputs(self.op_type): + if self._is_skip_name(out_name): + continue + if out_dup: + # if self.output = {'name': [(subname, Tensor), (subname, Tensor)]} + sub_out = self.expects[out_name] + if not isinstance(sub_out, list): + raise AssertionError( + "sub_out type %s is not list", type(sub_out) + ) + for item in sub_out: + sub_out_name, expect = item[0], item[1] + self.compare_single_output_with_expect(sub_out_name, expect) + else: + expect = self.expects[out_name] + self.compare_single_output_with_expect(out_name, expect) + + def check(self): + """ + return None means ok, raise Error means failed. + + the main enter point of Checker class + """ + self.init() + self.calculate_output() + self.compare_outputs_with_expects() + + class StaticChecker(Checker): + def init(self): + self.checker_name = "static checker" + + def calculate_output(self): + outs, fetch_list = self.op_test._calc_output( + place, no_check_set=no_check_set, check_cinn=check_cinn + ) + self.outputs = outs + self.fetch_list = fetch_list + if self.op_test.is_compared_with_fp32(): + self.op_test.enable_cal_ref_output() + ref_outs, ref_fetch_list = self.op_test._calc_output( + place, no_check_set=no_check_set + ) + self.op_test.disable_cal_ref_output() + self.ref_outputs = ref_outs + self.ref_fetch_list = ref_fetch_list + + def find_actual_value(self, name): + idx = find_actual(name, self.fetch_list) + actual = self.outputs[idx] + actual_t = np.array(actual) + return actual, actual_t + + def find_expect_value(self, name): + idx = find_expect(name, self.ref_fetch_list) + expect = self.ref_outputs[idx] + expect_t = np.array(expect) + return expect, expect_t + + def convert_uint16_to_float_ifneed(self, actual_np, expect_np): + """ + judge whether convert current output and expect to uint16. 
+ return True | False + """ + if actual_np.dtype == np.uint16: + if expect_np.dtype in [np.float32, np.float64]: + actual_np = convert_uint16_to_float(actual_np) + self.rtol = 1.0e-2 + elif actual_np.dtype == np.float16: + self.rtol = 1.0e-3 + else: + self.rtol = 1.0e-5 + if expect_np.dtype == np.uint16 and actual_np.dtype == np.uint16: + nonlocal atol + expect_np = convert_uint16_to_float(expect_np) + actual_np = convert_uint16_to_float(actual_np) + atol = max(atol, 0.03) + return actual_np, expect_np + + class DygraphChecker(Checker): + def init(self): + self.checker_name = "dygraph checker" + + def calculate_output(self): + # we only check end2end api when check_dygraph=True + self.is_python_api_test = True + dygraph_outs = self.op_test._calc_python_api_output(place) + if dygraph_outs is None: + self.is_python_api_test = False + # missing KernelSignature, fall back to eager middle output. + dygraph_outs = self.op_test._calc_dygraph_output( + place, no_check_set=no_check_set + ) + self.outputs = dygraph_outs + if self.op_test.is_compared_with_fp32(): + self.op_test.enable_cal_ref_output() + self.is_python_api_test = True + self.ref_outputs = self.op_test._calc_python_api_output(place) + if self.ref_outputs is None: + self.is_python_api_test = False + # missing KernelSignature, fall back to eager middle output. + self.ref_outputs = self.op_test._calc_dygraph_output( + place, no_check_set=no_check_set + ) + self.op_test.disable_cal_ref_output() + + def _compare_numpy(self, name, actual_np, expect_np): + expect_np = np.array(expect_np) + assert ( + actual_np.shape == expect_np.shape + ), f"Operator ({self.op_type}) : Output ({name}) shape mismatch, expect shape is {expect_np.shape}, but actual shape is {actual_np.shape}" + np.testing.assert_allclose( + actual_np, + expect_np, + atol=atol, + rtol=self.rtol if hasattr(self, "rtol") else rtol, + equal_nan=equal_nan, + err_msg=( + "Operator (" + + self.op_type + + ") Output (" + + name + + ") has diff at " + + str(place) + + " in " + + self.checker_name + ), + ) + + def convert_uint16_to_float_ifneed(self, actual_np, expect_np): + if actual_np.dtype == np.uint16: + self.rtol = 1.0e-2 + elif actual_np.dtype == np.float16: + self.rtol = 1.0e-3 + else: + self.rtol = 1.0e-5 + if self.op_test.is_bfloat16_op(): + if actual_np.dtype == np.uint16: + actual_np = convert_uint16_to_float(actual_np) + if expect_np.dtype == np.uint16: + expect_np = convert_uint16_to_float(expect_np) + return actual_np, expect_np + + def find_actual_value(self, name): + with base.dygraph.base.guard(place=place): + imperative_actual = find_imperative_actual( + name, self.outputs, place + ) + imperative_actual_t = np.array( + imperative_actual.value().get_tensor() + ) + return imperative_actual, imperative_actual_t + + def find_expect_value(self, name): + with base.dygraph.base.guard(place=place): + imperative_expect = find_imperative_expect( + name, self.ref_outputs, place + ) + imperative_expect_t = np.array( + imperative_expect.value().get_tensor() + ) + return imperative_expect, imperative_expect_t + + def _is_skip_name(self, name): + # if in final state and kernel signature don't have name, then skip it. 
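+                # python_out_sig is the output signature of the Python API; outputs
+                # that the Python API does not return (e.g. XShape) have no eager
+                # result to compare against, so they are skipped here.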
+ if ( + self.is_python_api_test + and hasattr(self.op_test, "python_out_sig") + and name not in self.op_test.python_out_sig + ): + return True + return super()._is_skip_name(name) + + class PirChecker(Checker): + def init(self): + self.checker_name = "pir checker" + + def calculate_output(self): + self.is_python_api_test = True + pir_outs = self.op_test._calc_pir_output(place) + if pir_outs is None: + self.is_python_api_test = False + # missing KernelSignature, fall back to eager middle output. + pir_outs = self.op_test._calc_dygraph_output( + place, no_check_set=no_check_set + ) + self.outputs = pir_outs + + if self.op_test.is_compared_with_fp32(): + self.op_test.enable_cal_ref_output() + self.is_python_api_test = True + self.ref_outputs = self.op_test._calc_pir_output(place) + if self.ref_outputs is None: + self.is_python_api_test = False + # missing KernelSignature, fall back to eager middle output. + self.ref_outputs = self.op_test._calc_dygraph_output( + place, no_check_set=no_check_set + ) + self.op_test.disable_cal_ref_output() + + def _compare_numpy(self, name, actual_np, expect_np): + expect_np = np.array(expect_np) + assert ( + actual_np.shape == expect_np.shape + ), f"Operator ({self.op_type}) : Output ({name}) shape mismatch, expect shape is {expect_np.shape}, but actual shape is {actual_np.shape}" + np.testing.assert_allclose( + actual_np, + expect_np, + atol=atol, + rtol=self.rtol if hasattr(self, "rtol") else rtol, + equal_nan=equal_nan, + err_msg=( + "Operator (" + + self.op_type + + ") Output (" + + name + + ") has diff at " + + str(place) + + " in " + + self.checker_name + ), + ) + + def convert_uint16_to_float_ifneed(self, actual_np, expect_np): + if actual_np.dtype == np.uint16: + self.rtol = 1.0e-2 + elif actual_np.dtype == np.float16: + self.rtol = 1.0e-3 + else: + self.rtol = 1.0e-5 + if self.op_test.is_bfloat16_op(): + if actual_np.dtype == np.uint16: + actual_np = convert_uint16_to_float(actual_np) + if expect_np.dtype == np.uint16: + expect_np = convert_uint16_to_float(expect_np) + return actual_np, expect_np + + def find_pir_actual(self, target_name, pir_outs, place): + for name in pir_outs: + if name == target_name: + return pir_outs[name][0] + + sub_dict = pir_outs[name][0] + if isinstance(sub_dict, dict): + for key, value in sub_dict.items(): + if key == target_name: + return value[0] + + raise AssertionError("No pir output named " + target_name) + + def find_pir_expect(self, target_name, dygraph_outs, place): + for name in dygraph_outs: + if name == target_name: + return dygraph_outs[name][0] + var_list = dygraph_outs[name] + for i, var in enumerate(var_list): + if isinstance(var, list): + for tensor in var: + if tensor.name == target_name: + return tensor + elif isinstance(var, paddle.Tensor) and var.name == target_name: + return dygraph_outs[name][i] + raise AssertionError("No pir ref_output named " + target_name) + + def find_actual_value(self, target_name): + with paddle.pir.core.program_guard( + paddle.pir.core.default_main_program() + ): + actual = self.find_pir_actual(target_name, self.outputs, place) + actual_t = np.array(actual) + return actual, actual_t + + def find_expect_value(self, target_name): + with paddle.pir.core.program_guard( + paddle.pir.core.default_main_program() + ): + expect = self.find_pir_expect(target_name, self.ref_outputs, place) + expect_t = np.array(expect) + return expect, expect_t + + def _is_skip_name(self, name): + # if in final state and kernel signature don't have name, then skip it. 
+ if ( + self.is_python_api_test + and hasattr(self.op_test, "python_out_sig") + and name not in self.op_test.python_out_sig + ): + return True + return super()._is_skip_name(name) + + class SymbolInferChecker(Checker): + def check(self): + """return None means ok, raise Error means failed.""" + self.init() + self.infer_and_compare_symbol() + + def init(self): + self.checker_name = "symbol infer checker" + + def infer_and_compare_symbol(self): + """infer symbol and compare it with actualy shape and data""" + self.is_python_api_test = True + self.op_test._infer_and_compare_symbol(place) + + # set some flags by the combination of arguments. + if self.is_float16_op(): + self.dtype = np.float16 + self.__class__.dtype = self.dtype + self.output_dtype = np.float16 + elif self.is_bfloat16_op(): + self.dtype = np.uint16 + self.__class__.dtype = self.dtype + self.output_dtype = np.uint16 + else: + self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs) + if ( + self.dtype == np.float64 + and self.op_type + not in op_threshold_white_list.NEED_FIX_FP64_CHECK_OUTPUT_THRESHOLD_OP_LIST + ): + atol = 0 + + if self.is_bfloat16_op(): + if self.is_mkldnn_op(): + check_dygraph = False + + if hasattr(self, "force_fp32_output") and self.force_fp32_output: + atol = max(atol, 0.01) + else: + atol = max(atol, 2) + else: + atol = max(atol, 0.01) + + if self.is_float16_op(): + atol = max(atol, 0.001) + + if no_check_set is not None: + if self.op_type not in no_check_set_white_list.no_check_set_white_list: + raise AssertionError( + f"no_check_set of op {self.op_type} must be set to None." + ) + + if check_prim: + with paddle.pir_utils.OldIrGuard(): + prim_checker = PrimForwardChecker(self, place) + prim_checker.check() + # Support operators which are not in the NO_FP64_CHECK_GRAD_OP_LIST list can be test prim with fp32 + self.__class__.check_prim = True + self.__class__.op_type = self.op_type + + if check_prim_pir: + with paddle.pir_utils.IrGuard(): + prim_checker = PrimForwardChecker(self, place) + prim_checker.check() + # Support operators which are not in the NO_FP64_CHECK_GRAD_OP_LIST list can be test prim with fp32 + self.__class__.check_prim_pir = True + self.__class__.op_type = self.op_type + if only_check_prim: + return + + if check_auto_parallel: + if is_ban_auto_parallel_test(place): + pass + else: + ( + forward_test_info_path, + generated_forward_test_path, + ) = get_test_info_and_generated_test_path( + self.__class__.__name__, self.op_type, backward=False + ) + with auto_parallel_test_guard( + forward_test_info_path, generated_forward_test_path + ): + dump_test_info(self, place, forward_test_info_path, backward=False) + python_api_info = { + "api_name": self.python_api.__name__, + "api_module": ( + inspect.getmodule(self.python_api).__name__ + if inspect.getmodule(self.python_api).__name__.startswith( + "paddle" + ) + else pathlib.Path( + inspect.getmodule(self.python_api).__file__ + ).stem + ), + } + # code gen for auto parallel forward test + gen_auto_parallel_test_file( + check_grad=False, + test_info_path=forward_test_info_path, + test_file_path=generated_forward_test_path, + python_api_info=python_api_info, + ) + runtime_envs = get_subprocess_runtime_envs(place) + start_command = get_subprocess_command( + runtime_envs["CUDA_VISIBLE_DEVICES"], + generated_forward_test_path, + log_dir=(self.log_dir if hasattr(self, "log_dir") else None), + ) + run_subprocess(start_command, runtime_envs, timeout=120) + + static_checker = StaticChecker(self, self.outputs) + static_checker.check() + outs, 
fetch_list = static_checker.outputs, static_checker.fetch_list + + if check_pir_onednn and isinstance(place, paddle.base.libpaddle.CPUPlace): + with pir_executor_guard(): + pir_onednn_static_checker = StaticChecker(self, self.outputs) + pir_onednn_static_checker.check() + + if check_dygraph: + dygraph_checker = DygraphChecker(self, self.outputs) + dygraph_checker.check() + dygraph_dygraph_outs = dygraph_checker.outputs + + if check_pir: + if ( + type(place) is paddle.base.libpaddle.CPUPlace + or type(place) is paddle.base.libpaddle.CUDAPlace + ): + with paddle.pir_utils.IrGuard(): + pir_checker = PirChecker(self, self.outputs) + pir_checker.check() + + if check_pir and check_symbol_infer: + if ( + type(place) is paddle.base.libpaddle.CPUPlace + or type(place) is paddle.base.libpaddle.CUDAPlace + ): + with paddle.pir_utils.IrGuard(): + symbol_checker = SymbolInferChecker(self, self.outputs) + symbol_checker.check() + + # Note(zhiqiu): inplace_atol should be only set when op doesn't ensure + # computational consistency. + # For example, group_norm uses AtomicAdd on CUDAPlace, which do not ensure + # computation order when multiple threads write the same address. So the + # result of group_norm is non-deterministic when datatype is float. + # When inplace_atol is not None, the inplace check uses numpy.allclose + # to check inplace result instead of numpy.array_equal. + if inplace_atol is not None: + warnings.warn( + "inplace_atol should only be set when op doesn't ensure computational consistency, please check it!" + ) + # Check inplace for given op, its grad op, its grad_grad op, etc. + # No effect on original OpTest + # Currently not support ParallelExecutor on XPUPlace. + if not paddle.is_compiled_with_xpu() and not isinstance( + place, core.CustomPlace + ): + self.check_inplace_output_with_place( + place, no_check_set=no_check_set, inplace_atol=inplace_atol + ) + + if check_dygraph: + return outs, dygraph_dygraph_outs, fetch_list + else: + return outs, fetch_list + + def check_compile_vs_runtime(self, fetch_list, fetch_outs): + def find_fetch_index(target_name, fetch_list): + found = [ + i for i, var_name in enumerate(fetch_list) if var_name == target_name + ] + if len(found) == 0: + return -1 + else: + self.assertTrue( + len(found) == 1, + f"Found {len(found)} {target_name}", + ) + return found[0] + + for name in self.op.desc.output_names(): + var_names = self.op.desc.output(name) + for var_name in var_names: + i = find_fetch_index(var_name, fetch_list) + if i == -1: + # The output is dispensable or intermediate. + break + out = fetch_outs[i] + if isinstance(out, DenseTensor): + lod_level_runtime = len(out.lod()) + else: + if isinstance(out, DenseTensorArray): + warnings.warn( + "The check of DenseTensorArray's lod_level is not implemented now!" 
+ ) + lod_level_runtime = 0 + + var = self.program.global_block().var(var_name) + if var.type == DENSE_TENSOR: + lod_level_compile = var.lod_level + else: + lod_level_compile = 0 + self.assertEqual( + lod_level_compile, + lod_level_runtime, + "The lod_level of Output (" + + name + + ") is different between compile-time and runtime (" + + str(lod_level_compile) + + " vs " + + str(lod_level_runtime) + + ")", + ) + + def _get_places(self): + if self.dtype == np.float16 or self.dtype == "float16": + if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + return [place] + else: + return [] + else: + return [] + places = [] + cpu_only = self._cpu_only if hasattr(self, "_cpu_only") else False + if ( + os.environ.get("FLAGS_CI_both_cpu_and_gpu", "False").lower() + in [ + "1", + "true", + "on", + ] + or not ( + core.is_compiled_with_cuda() + and core.op_support_gpu(self.op_type) + and not cpu_only + ) + or self.op_type + in [ + "gaussian_random", + "lrn", + ] + ): + places.append(base.CPUPlace()) + if ( + core.is_compiled_with_cuda() + and core.op_support_gpu(self.op_type) + and not cpu_only + ): + places.append(core.CUDAPlace(0)) + return places + + def check_output( + self, + atol=1e-5, + rtol=1e-5, + no_check_set=None, + equal_nan=False, + check_dygraph=True, + check_prim=False, + check_prim_pir=False, + inplace_atol=None, + check_cinn=False, + only_check_prim=False, + check_pir=False, + check_auto_parallel=False, + check_pir_onednn=False, + check_symbol_infer=True, + ): + self.__class__.op_type = self.op_type + if self.is_mkldnn_op(): + self.__class__.use_mkldnn = True + + if self.is_xpu_op(): + self.__class__.use_xpu = True + + if hasattr(self, "use_custom_device") and self.use_custom_device: + check_dygraph = False + + places = self._get_places() + for place in places: + res = self.check_output_with_place( + place, + atol, + rtol, + no_check_set, + equal_nan, + check_dygraph=check_dygraph, + check_prim=check_prim, + check_prim_pir=check_prim_pir, + only_check_prim=only_check_prim, + inplace_atol=inplace_atol, + check_cinn=check_cinn, + check_pir=check_pir, + check_auto_parallel=check_auto_parallel, + check_pir_onednn=check_pir_onednn, + check_symbol_infer=check_symbol_infer, + ) + if not res and only_check_prim: + continue + if check_dygraph: + outs, dygraph_dygraph_outs, fetch_list = res + else: + outs, fetch_list = res + if ( + self.op_type + not in compile_vs_runtime_white_list.COMPILE_RUN_OP_WHITE_LIST + ): + if os.getenv("FLAGS_enable_pir_in_executor"): + return + self.check_compile_vs_runtime(fetch_list, outs) + + def check_output_customized(self, checker, custom_place=None, check_pir=False): + self.__class__.op_type = self.op_type + places = self._get_places() + if custom_place: + places.append(custom_place) + for place in places: + outs = self.calc_output(place) + outs = [np.array(out) for out in outs] + outs.sort(key=len) + checker(outs) + if check_pir: + with paddle.pir_utils.IrGuard(): + outs_p = self._calc_pir_output(place) + outs_p = [outs_p[out] for out in outs_p] + outs_p.sort(key=len) + checker(outs_p[0]) + + def check_output_with_place_customized(self, checker, place, check_pir=False): + outs = self.calc_output(place) + outs = [np.array(out) for out in outs] + outs.sort(key=len) + checker(outs) + if check_pir: + with paddle.pir_utils.IrGuard(): + outs_p = self._calc_pir_output(place) + outs_p = [outs_p[out][0] for out in outs_p] + outs_p.sort(key=len) + checker(outs_p) + + def 
_assert_is_close( + self, + numeric_grads, + analytic_grads, + names, + max_relative_error, + msg_prefix, + atol=1e-5, + ): + for a, b, name in zip(numeric_grads, analytic_grads, names): + assert tuple(a.shape) == tuple( + b.shape + ), f"Operator ({self.op_type}) : Output ({name}) gradient shape mismatch, expect shape is {a.shape}, but actual shape is {b.shape}" + # Used by bfloat16 for now to solve precision problem + if self.is_bfloat16_op(): + if a.size == 0: + self.assertTrue(b.size == 0) + np.testing.assert_allclose( + b, + a, + rtol=max_relative_error, + atol=atol, + equal_nan=False, + err_msg=( + f"Operator {self.op_type} error, {msg_prefix} variable {name} (shape: {a.shape}, dtype: {self.dtype}) max gradient diff over limit" + ), + ) + else: + # It asserts np.abs(a - b) / np.abs(a) < max_relative_error, in which + # max_relative_error is 1e-7. According to the value of np.abs(a), we + # change np.abs(a) to achieve dynamic threshold. For example, if + # the value of np.abs(a) is between 1e-10 and 1e-8, we set np.abs(a)*=1e4. + # Therefore, it asserts np.abs(a - b) / (np.abs(a)*1e4) < max_relative_error, + # which is the same as np.abs(a - b) / np.abs(a) < max_relative_error*1e4. + abs_a = np.abs(a) + if abs_a.ndim > 0: + if ( + self.dtype == np.float64 + and self.op_type + not in op_threshold_white_list.NEED_FIX_FP64_CHECK_GRAD_THRESHOLD_OP_LIST + ): + abs_a[abs_a < 1e-10] = 1e-3 + abs_a[np.logical_and(abs_a > 1e-10, abs_a <= 1e-8)] *= 1e4 + abs_a[np.logical_and(abs_a > 1e-8, abs_a <= 1e-6)] *= 1e2 + elif self.is_bfloat16_op(): + abs_a[abs_a < 1e-2] = 1 + else: + abs_a[abs_a < 1e-3] = 1 + elif abs_a.ndim == 0: + if ( + self.dtype == np.float64 + and self.op_type + not in op_threshold_white_list.NEED_FIX_FP64_CHECK_GRAD_THRESHOLD_OP_LIST + ): + if abs_a < 1e-10: + abs_a = 1e-3 + elif abs_a > 1e-10 and abs_a <= 1e-8: + abs_a = abs_a * 1e4 + elif abs_a > 1e-8 and abs_a <= 1e-6: + abs_a = abs_a * 1e2 + elif self.is_bfloat16_op(): + abs_a = 1 if abs_a < 1e-2 else abs_a + else: + abs_a = 1 if abs_a < 1e-3 else abs_a + + if self.dtype == np.bool_: + diff_mat = np.abs(a ^ b) / abs_a + else: + diff_mat = np.abs(a - b) / abs_a + max_diff = np.max(diff_mat) + + def err_msg(): + offset = np.argmax(diff_mat > max_relative_error) + return ( + "Operator %s error, %s variable %s (shape: %s, dtype: %s) max gradient diff %e over limit %e, " + "the first error element is %d, expected %e, but got %e." 
+ ) % ( + self.op_type, + msg_prefix, + name, + str(a.shape), + self.dtype, + max_diff, + max_relative_error, + offset, + a.flatten()[offset], + b.flatten()[offset], + ) + + self.assertLessEqual(max_diff, max_relative_error, err_msg()) + + def _check_grad_helper(self): + if self.is_float16_op(): + self.dtype = np.float16 + self.__class__.dtype = self.dtype + self.output_dtype = np.float16 + elif self.is_bfloat16_op(): + self.dtype = np.uint16 + self.__class__.dtype = self.dtype + self.output_dtype = np.uint16 + else: + self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs) + self.__class__.op_type = self.op_type + self.__class__.exist_check_grad = True + if self.dtype == np.float64: + self.__class__.exist_fp64_check_grad = True + + def check_grad( + self, + inputs_to_check, + output_names, + no_grad_set=None, + numeric_grad_delta=0.005, + in_place=False, + max_relative_error=0.005, + user_defined_grads=None, + user_defined_grad_outputs=None, + check_dygraph=True, + check_prim=False, + check_prim_pir=False, + only_check_prim=False, + atol=1e-5, + check_cinn=False, + check_pir=False, + check_auto_parallel=False, + check_pir_onednn=False, + ): + if hasattr(self, "use_custom_device") and self.use_custom_device: + check_dygraph = False + + self._check_grad_helper() + places = self._get_places() + for place in places: + self.check_grad_with_place( + place, + inputs_to_check, + output_names, + no_grad_set, + numeric_grad_delta, + in_place, + max_relative_error, + user_defined_grads, + user_defined_grad_outputs, + check_dygraph=check_dygraph, + check_prim=check_prim, + check_prim_pir=check_prim_pir, + only_check_prim=only_check_prim, + atol=atol, + check_cinn=check_cinn, + check_pir=check_pir, + check_auto_parallel=check_auto_parallel, + check_pir_onednn=check_pir_onednn, + ) + + def check_grad_with_place_for_static( + self, + user_defined_grads, + inputs_to_check, + place, + output_names, + no_grad_set, + user_defined_grad_outputs, + numeric_place, + numeric_grad_delta, + in_place, + check_cinn, + max_relative_error, + atol, + ): + if user_defined_grads is None and self.is_compared_with_fp32(): + self.enable_cal_ref_output() + numeric_grads = self._get_gradient( + inputs_to_check, + place, + output_names, + no_grad_set, + user_defined_grad_outputs, + ) + self.disable_cal_ref_output() + else: + numeric_grads = user_defined_grads or [ + get_numeric_gradient( + numeric_place, + self.scope, + self.op, + self.inputs, + input_to_check, + output_names, + delta=numeric_grad_delta, + in_place=in_place, + ) + for input_to_check in inputs_to_check + ] + + analytic_grads = self._get_gradient( + inputs_to_check, + place, + output_names, + no_grad_set, + user_defined_grad_outputs, + check_cinn=check_cinn, + ) + # comparison of bf16 results will happen as fp32 + # loop over list of grads and convert bf16 to fp32 + + fp32_analytic_grads = [] + for grad in analytic_grads: + if grad.dtype == np.uint16: + grad = convert_uint16_to_float(grad) + max_relative_error = max(max_relative_error, 0.01) + fp32_analytic_grads.append(grad) + analytic_grads = fp32_analytic_grads + + fp32_numeric_grads = [] + for grad in numeric_grads: + if grad.dtype == np.uint16: + grad = convert_uint16_to_float(grad) + max_relative_error = max(max_relative_error, 0.01) + fp32_numeric_grads.append(grad) + numeric_grads = fp32_numeric_grads + + if self.is_float16_op(): + max_relative_error = max(max_relative_error, 0.001) + self._assert_is_close( + numeric_grads, + analytic_grads, + inputs_to_check, + max_relative_error, + f"Gradient 
Check On {place}", + atol=atol, + ) + + return numeric_grads + + def check_grad_with_place( + self, + place, + inputs_to_check, + output_names, + no_grad_set=None, + numeric_grad_delta=0.005, + in_place=False, + max_relative_error=0.005, + user_defined_grads=None, + user_defined_grad_outputs=None, + check_dygraph=True, + check_prim=False, + check_prim_pir=False, + only_check_prim=False, + numeric_place=None, + atol=1e-5, + check_cinn=False, + check_pir=False, + check_auto_parallel=False, + check_pir_onednn=False, + ): + if hasattr(self, "use_custom_device") and self.use_custom_device: + check_dygraph = False + + if not self.is_mkldnn_op(): + set_flags({"FLAGS_use_mkldnn": False}) + + core._set_prim_all_enabled(False) + core.set_prim_eager_enabled(False) + if check_prim: + with paddle.pir_utils.OldIrGuard(): + self._check_grad_helper() + prim_grad_checker = PrimGradChecker( + self, + place, + inputs_to_check, + output_names, + no_grad_set, + user_defined_grad_outputs, + ) + prim_grad_checker.check() + # Support operators which are not in the NO_FP64_CHECK_GRAD_OP_LIST list can be test prim with fp32 + self.__class__.check_prim = True + + if check_prim_pir: + with paddle.pir_utils.IrGuard(): + self._check_grad_helper() + prim_grad_checker = PrimGradChecker( + self, + place, + inputs_to_check, + output_names, + no_grad_set, + user_defined_grad_outputs, + ) + prim_grad_checker.check() + # Support operators which are not in the NO_FP64_CHECK_GRAD_OP_LIST list can be test prim with fp32 + self.__class__.check_prim_pir = True + + if only_check_prim: + return + + if check_auto_parallel: + if is_ban_auto_parallel_test(place): + pass + else: + ( + grad_test_info_path, + generated_grad_test_path, + ) = get_test_info_and_generated_test_path( + self.__class__.__name__, self.op_type, backward=True + ) + with auto_parallel_test_guard( + grad_test_info_path, generated_grad_test_path + ): + backward_extra_test_info = {} + backward_extra_test_info["inputs_to_check"] = inputs_to_check + backward_extra_test_info["output_names"] = output_names + backward_extra_test_info["no_grad_set"] = no_grad_set + backward_extra_test_info[ + "user_defined_grad_outputs" + ] = user_defined_grad_outputs + dump_test_info( + self, + place, + grad_test_info_path, + backward=True, + backward_extra_test_info=backward_extra_test_info, + ) + python_api_info = { + "api_name": self.python_api.__name__, + "api_module": ( + inspect.getmodule(self.python_api).__name__ + if inspect.getmodule(self.python_api).__name__.startswith( + "paddle" + ) + else pathlib.Path( + inspect.getmodule(self.python_api).__file__ + ).stem + ), + } + # code gen for auto parallel grad test + gen_auto_parallel_test_file( + check_grad=False, + test_info_path=grad_test_info_path, + test_file_path=generated_grad_test_path, + python_api_info=python_api_info, + ) + runtime_envs = get_subprocess_runtime_envs(place) + + num_devices = len(runtime_envs["CUDA_VISIBLE_DEVICES"].split(",")) + if num_devices > paddle.device.cuda.device_count(): + self.skipTest("number of GPUs is not enough") + + start_command = get_subprocess_command( + runtime_envs["CUDA_VISIBLE_DEVICES"], + generated_grad_test_path, + log_dir=(self.log_dir if hasattr(self, "log_dir") else None), + ) + run_subprocess(start_command, runtime_envs, timeout=120) + + self.scope = core.Scope() + op_inputs = self.inputs if hasattr(self, "inputs") else {} + op_outputs = self.outputs if hasattr(self, "outputs") else {} + op_attrs = self.attrs if hasattr(self, "attrs") else {} + self._check_grad_helper() + if 
self.is_bfloat16_op(): + if self.is_mkldnn_op(): + check_dygraph = False + atol = max(atol, 0.01) + + if self.is_float16_op(): + atol = max(atol, 0.001) + + if ( + self.dtype == np.float64 + and self.op_type + not in op_threshold_white_list.NEED_FIX_FP64_CHECK_GRAD_THRESHOLD_OP_LIST + ): + numeric_grad_delta = 1e-5 + max_relative_error = 1e-7 + + cache_list = None + if hasattr(self, "cache_name_list"): + cache_list = self.cache_name_list + + # oneDNN numeric gradient should use CPU kernel + use_onednn = False + if op_attrs.get("use_mkldnn"): + op_attrs["use_mkldnn"] = False + use_onednn = True + if hasattr(self, "attrs"): + for k, v in self.attrs.items(): + if isinstance(v, paddle.base.core.DataType): + self.attrs[k] = paddle.pir.core.datatype_to_vartype[v] + + self.op = create_op( + self.scope, + self.op_type, + op_inputs, + op_outputs, + op_attrs, + cache_list=cache_list, + ) + + if use_onednn: + op_attrs["use_mkldnn"] = True + + if no_grad_set is None: + no_grad_set = set() + else: + if ( + (self.op_type not in no_grad_set_white_list.NEED_TO_FIX_OP_LIST) + and (self.op_type not in no_grad_set_white_list.NOT_CHECK_OP_LIST) + and (not self.is_bfloat16_op()) + ): + raise AssertionError( + "no_grad_set must be None, op_type is " + self.op_type + " Op." + ) + + for input_to_check in inputs_to_check: + set_input(self.scope, self.op, self.inputs, place) + tensor_to_check = self.scope.find_var(input_to_check).get_tensor() + tensor_size = functools.reduce( + lambda a, b: a * b, tensor_to_check.shape(), 1 + ) + tensor_ndim = len(tensor_to_check.shape()) + # for 0D Tensor, it's additional case for OP, so not raise error + if tensor_ndim > 0 and tensor_size < 100: + self.__class__.input_shape_is_large = False + + if type(output_names) is not list: + output_names = [output_names] + + if numeric_place is None: + numeric_place = place + + with paddle.pir_utils.OldIrGuard(): + numeric_grads = self.check_grad_with_place_for_static( + user_defined_grads, + inputs_to_check, + place, + output_names, + no_grad_set, + user_defined_grad_outputs, + numeric_place, + numeric_grad_delta, + in_place, + check_cinn, + max_relative_error, + atol, + ) + + if check_pir_onednn and isinstance(place, paddle.base.libpaddle.CPUPlace): + with pir_executor_guard(): + self.check_grad_with_place_for_static( + user_defined_grads, + inputs_to_check, + place, + output_names, + no_grad_set, + user_defined_grad_outputs, + numeric_place, + numeric_grad_delta, + in_place, + check_cinn, + max_relative_error, + atol, + ) + + if check_dygraph: + with base.dygraph.base.guard(place): + dygraph_dygraph_grad = self._get_dygraph_grad( + inputs_to_check, + place, + output_names, + user_defined_grad_outputs, + no_grad_set, + check_dygraph, + ) + fp32_grads = [] + for grad in dygraph_dygraph_grad: + if grad.dtype == np.uint16: + grad = convert_uint16_to_float(grad) + max_relative_error = max(max_relative_error, 0.03) + fp32_grads.append(grad) + dygraph_dygraph_grad = fp32_grads + self._assert_is_close( + numeric_grads, + dygraph_dygraph_grad, + inputs_to_check, + max_relative_error, + f"Gradient Check On {place}", + atol=atol, + ) + + # get pir gradient + if check_pir: + if ( + type(place) is paddle.base.libpaddle.CPUPlace + or type(place) is paddle.base.libpaddle.CUDAPlace + ): + with paddle.pir_utils.IrGuard(): + pir_grad = self._get_ir_gradient( + inputs_to_check, + place, + output_names, + user_defined_grad_outputs, + no_grad_set, + ) + fp32_analytic_grads = [] + for grad in pir_grad: + if grad.dtype == np.uint16: + grad = 
convert_uint16_to_float(grad) + max_relative_error = max(max_relative_error, 0.01) + fp32_analytic_grads.append(grad) + pir_grad = fp32_analytic_grads + if self.is_float16_op(): + max_relative_error = max(max_relative_error, 0.01) + self._assert_is_close( + numeric_grads, + pir_grad, + inputs_to_check, + max_relative_error, + f"Gradient Check On {place}", + atol=atol, + ) + + def _find_var_in_dygraph(self, output_vars, name): + if name in output_vars: + return output_vars[name] + else: + for output_vars_index in output_vars: + for output_vars_selected in output_vars[output_vars_index]: + if isinstance(output_vars_selected, list): + for tensor in output_vars_selected: + if tensor.name == name: + return [tensor] + elif isinstance(output_vars_selected, paddle.Tensor): + if output_vars_selected.name == name: + return [output_vars_selected] + raise AssertionError(name, " not in outputs:", output_vars.keys()) + + def _get_dygraph_grad( + self, + inputs_to_check, + place, + output_names, + user_defined_grad_outputs=None, + no_grad_set=None, + check_dygraph=True, + ): + if hasattr(self, "use_custom_device") and self.use_custom_device: + check_dygraph = False + + with base.dygraph.base.guard(place=place): + block = base.framework.default_main_program().global_block() + + op_proto = OpProtoHolder.instance().get_op_proto(self.op_type) + + # prepare input variable + inputs, inputs_grad_dict = self.append_input_output_for_dygraph( + op_proto, self.inputs, True, True, block + ) + + # prepare output variable + outputs = self.append_input_output_for_dygraph( + op_proto, self.outputs, False, False, block + ) + + # prepare attributes + attrs_outputs = {} + if hasattr(self, "attrs"): + for attrs_name in self.attrs: + if self.attrs[attrs_name] is not None: + attrs_outputs[attrs_name] = self.attrs[attrs_name] + + if check_dygraph: + dygraph_outputs = self._calc_python_api_output(place, inputs, outputs) + if dygraph_outputs is None: + # missing KernelSignature, fall back to eager middle output. + dygraph_outputs = self._calc_dygraph_output( + place, egr_inps=inputs, egr_oups=outputs + ) + + outputs = dygraph_outputs + + if self.dtype == np.uint16: + cast_inputs = [] + for output_name in output_names: + cast_input = self._find_var_in_dygraph(outputs, output_name) + cast_inputs = cast_inputs + cast_input + cast_outputs = [] + for cast_input in cast_inputs: + if isinstance(cast_input, paddle.Tensor): + cast_outputs.append(paddle.cast(cast_input, paddle.float32)) + else: + raise TypeError( + f"Unsupported test data type {type(cast_input)}." 
+ ) + + outputs = {} + for i in range(len(output_names)): + outputs.update({output_names[i]: [cast_outputs[i]]}) + outputs_valid = {} + for output_name in output_names: + outputs_valid[output_name] = self._find_var_in_dygraph( + outputs, output_name + ) + + if user_defined_grad_outputs is None: + if len(outputs_valid) == 1: + for outputs_valid_key in outputs_valid: + loss = paddle.mean(outputs_valid[outputs_valid_key][0]) + else: + avg_sum = [] + for cur_loss in outputs_valid: + cur_avg_loss = paddle.mean(outputs_valid[cur_loss][0]) + avg_sum.append(cur_avg_loss) + loss_sum = paddle.add_n(avg_sum) + loss = paddle.scale(loss_sum, scale=1.0 / float(len(avg_sum))) + loss.backward() + + fetch_list_grad = [] + for inputs_to_check_name in inputs_to_check: + a = inputs_grad_dict[inputs_to_check_name].gradient() + fetch_list_grad.append(a) + return fetch_list_grad + else: + # user_defined_grad_outputs here are numpy arrays + if not isinstance(user_defined_grad_outputs, list): + user_defined_grad_outputs = [user_defined_grad_outputs] + grad_outputs = [] + for grad_out_value in user_defined_grad_outputs: + grad_outputs.append(paddle.to_tensor(grad_out_value)) + # delete the inputs which no need to calculate grad + for no_grad_val in no_grad_set: + del inputs[no_grad_val] + grad_inputs = paddle.grad( + outputs=paddle.utils.flatten(outputs), + inputs=paddle.utils.flatten(inputs), + grad_outputs=grad_outputs, + ) + return [grad.numpy(False) for grad in grad_inputs] + + @staticmethod + def _numpy_to_lod_tensor(np_value, lod, place): + tensor = core.DenseTensor() + tensor.set(np_value, place) + if lod is not None: + tensor.set_recursive_sequence_lengths(lod) + return tensor + + @staticmethod + def np_dtype_to_base_dtype(input): + return input + + @staticmethod + def base_dtype_to_np_dtype(self, dtype): + return dtype + + @staticmethod + def np_value_to_base_value(input): + return input + + def cast_bf16_output(self, block, cast_inputs): + output_names = [] + for i in range(0, len(cast_inputs)): + cast_output = block.create_var(dtype="float32", shape=cast_inputs[i].shape) + cast_op = block.append_op( + inputs={"X": cast_inputs[i]}, + outputs={"Out": cast_output}, + type="cast", + attrs={ + "in_dtype": core.VarDesc.VarType.BF16, + "out_dtype": core.VarDesc.VarType.FP32, + }, + ) + cast_op.desc.infer_var_type(block.desc) + cast_op.desc.infer_shape(block.desc) + output_names.append(cast_output.name) + return output_names + + def _check_ir_grad_output( + self, place, program, scope, feed_dict, fetch_list, gradients + ): + if os.getenv("FLAGS_PIR_OPTEST") is None: + return + if os.getenv("FLAGS_PIR_OPTEST_WHITE_LIST") is None: + return + if self.check_prim or self.check_prim_pir: + return + if self._check_cinn: + return + + stored_flag = get_flags( + [ + "FLAGS_enable_pir_in_executor", + "FLAGS_pir_apply_inplace_pass", + ] + ) + try: + set_flags( + { + "FLAGS_enable_pir_in_executor": True, + "FLAGS_pir_apply_inplace_pass": 0, + } + ) + executor = Executor(place) + new_gradients = list( + map( + np.array, + executor.run( + program, + feed_dict, + fetch_list, + scope=scope, + return_numpy=False, + ), + ) + ) + + check_method = np.testing.assert_array_equal + if os.getenv("FLAGS_PIR_OPTEST_RELAX_CHECK", None) == "True": + + def relaxed_check_method(x, y, err_msg): + atol = 1e-6 + rtol = 1e-6 + if x.dtype == np.float16: + atol = 1e-5 + rtol = 1e-3 + np.testing.assert_allclose( + x, y, err_msg=err_msg, atol=atol, rtol=rtol + ) + + check_method = relaxed_check_method + + if os.getenv("FLAGS_PIR_NO_CHECK", None) == 
"True": + + def no_check_method(x, y, err_msg): + pass + + check_method = no_check_method + + for i in range(len(new_gradients)): + check_method( + gradients[i], + new_gradients[i], + err_msg="Operator GradCheck (" + + self.op_type + + ") has diff at " + + str(place) + + "\nExpect " + + str(gradients[i]) + + "\n" + + "But Got" + + str(new_gradients[i]) + + " in class " + + self.__class__.__name__, + ) + finally: + set_flags(stored_flag) + + def _get_gradient( + self, + input_to_check, + place, + output_names, + no_grad_set, + user_defined_grad_outputs=None, + parallel=False, + check_cinn=False, + ): + with paddle.pir_utils.OldIrGuard(): + prog = Program() + scope = core.Scope() + ir_scope = core.Scope() + block = prog.global_block() + self._append_ops(block) + + inputs = self._get_inputs(block) + outputs = self._get_outputs(block) + feed_dict = self.feed_var(inputs, place) + + if user_defined_grad_outputs is None: + if self.dtype == np.uint16 and not self.is_calc_ref: + cast_inputs = list(map(block.var, output_names)) + if self.op_type in ["broadcast_tensors", "meshgrid"]: + output_names = self.cast_bf16_output(block, cast_inputs) + else: + cast_outputs = block.create_var( + dtype="float32", shape=cast_inputs[0].shape + ) + cast_op = block.append_op( + inputs={"X": cast_inputs}, + outputs={"Out": cast_outputs}, + type="cast", + attrs={ + "in_dtype": core.VarDesc.VarType.BF16, + "out_dtype": core.VarDesc.VarType.FP32, + }, + ) + cast_op.desc.infer_var_type(block.desc) + cast_op.desc.infer_shape(block.desc) + output_names = [cast_outputs.name] + loss = append_loss_ops(block, output_names) + param_grad_list = append_backward( + loss=loss, + parameter_list=input_to_check, + no_grad_set=no_grad_set, + ) + fetch_list = [g for p, g in param_grad_list] + else: + assert ( + parallel is False + ), "unsupported parallel mode when giving custom grad outputs." 
+ # user_defined_grad_outputs here are numpy arrays + if not isinstance(user_defined_grad_outputs, list): + user_defined_grad_outputs = [user_defined_grad_outputs] + grad_outputs = [] + for grad_out_value in user_defined_grad_outputs: + # `persistable` is used to avoid executor create new var in local scope + var = block.create_var( + shape=grad_out_value.shape, + dtype=grad_out_value.dtype, + persistable=True, + ) + true_var = scope.var(var.name) + tensor = true_var.get_tensor() + tensor.set(grad_out_value, place) + grad_outputs.append(var) + if os.getenv("FLAGS_PIR_OPTEST") is not None: + ir_true_var = ir_scope.var(var.name) + ir_tensor = ir_true_var.get_tensor() + ir_tensor.set(grad_out_value, place) + + targets = [outputs[name] for name in outputs if name in output_names] + inputs = [inputs[name] for name in input_to_check if name in inputs] + grad_inputs = paddle.static.gradients( + targets, inputs, grad_outputs, no_grad_set + ) + fetch_list = [grad.name for grad in grad_inputs] + + enable_cinn_test = check_cinn and self._enable_check_cinn_test( + place, feed_dict, outputs + ) + if enable_cinn_test: + if hasattr(self, "cinn_atol"): + self.atol = self.cinn_atol + if hasattr(self, "cinn_rtol"): + self.rtol = self.cinn_rtol + + if parallel or enable_cinn_test: + use_cuda = False + if isinstance(place, base.CUDAPlace): + use_cuda = True + + build_strategy = None + if enable_cinn_test: + build_strategy = base.BuildStrategy() + build_strategy.build_cinn_pass = check_cinn + self._check_cinn = True + + compiled_prog = base.CompiledProgram(prog, build_strategy) + prog = compiled_prog + executor = base.Executor(place) + res = list( + map( + np.array, + executor.run( + prog, + feed_dict, + fetch_list, + scope=scope, + return_numpy=False, + ), + ) + ) + + self._check_ir_grad_output( + place, prog, ir_scope, feed_dict, fetch_list, res + ) + + return res + + def _find_var_in_pir(self, output_vars, target_name): + for name in output_vars: + if name == target_name: + return output_vars[name] + + sub_dict = output_vars[name][0] + if isinstance(sub_dict, dict): + for key, value in sub_dict.items(): + if key == target_name: + return value + raise AssertionError(target_name, " not in outputs:", output_vars.keys()) + + def _get_ir_gradient( + self, + inputs_to_check, + place, + output_names, + user_defined_grad_outputs=None, + no_grad_set=None, + ): + def construct_output_dict_by_kernel_sig(ret_tuple, output_sig): + if hasattr(self, "python_out_sig"): + output_sig = self.python_out_sig + if not isinstance(ret_tuple, (tuple, list)): + ret_tuple = [ret_tuple] + if len(output_sig) == len(ret_tuple): + # [assumption]: we assume {"Out": [Tensor]} + return {a: [b] for a, b in zip(output_sig, ret_tuple)} + else: + # [assumption]: return multi-Tensor in a single output. such as paddle.split() + assert ( + len(output_sig) == 1 + ), "Don't support multi-output with multi-tensor output. 
(May be you can use set `python_out_sig`, see `test_squeeze2_op` as a example.)" + return {output_sig[0]: ret_tuple} + + # get kernel signature + kernel_sig = self.get_kernel_signature(place) + ir_program = paddle.static.Program() + with paddle.static.program_guard(ir_program): + with scope_guard(Scope()): + # prepare inps attributes feed + ( + static_inputs, + attrs, + inputs_dict, + feed, + ) = self.get_ir_input_attr_dict_and_feed(stop_gradient=False) + # prepare args + args = OpTestUtils.prepare_python_api_arguments( + self.python_api, + static_inputs, + attrs, + kernel_sig, + target_dtype=paddle.pir.core.DataType, + ) + inputs_sig, attrs_sig, outputs_sig = kernel_sig + args = OpTestUtils.assumption_assert_and_transform( + args, len(inputs_sig) + ) + grad_outputs = [] + if user_defined_grad_outputs is not None: + # user_defined_grad_outputs here are numpy arrays + if not isinstance(user_defined_grad_outputs, list): + user_defined_grad_outputs = [user_defined_grad_outputs] + for grad_out_value, idx in zip( + user_defined_grad_outputs, + range(len(user_defined_grad_outputs)), + ): + grad_val = paddle.static.data( + name=f"val_grad_{idx}", + shape=grad_out_value.shape, + dtype=grad_out_value.dtype, + ) + grad_outputs.append(grad_val) + feed.update({f"val_grad_{idx}": grad_out_value}) + # delete the inputs which no need to calculate grad + for no_grad_val in no_grad_set: + del static_inputs[no_grad_val] + + ret_tuple = self.python_api(*args) + outputs = construct_output_dict_by_kernel_sig(ret_tuple, outputs_sig) + if hasattr(self, "python_out_sig_sub_name"): + for key in self.python_out_sig_sub_name.keys(): + outputs[key][0] = { + a: [b] + for a, b in zip( + self.python_out_sig_sub_name[key], + outputs[key][0], + ) + } + fetch_list = getattr(self, "fetch_list", []) + + # cast outputs + if self.dtype == np.uint16: + cast_inputs = [] + for output_name in output_names: + cast_input = self._find_var_in_pir(outputs, output_name) + cast_inputs = cast_inputs + cast_input + cast_outputs = [] + for cast_input in cast_inputs: + if isinstance(cast_input, paddle.base.libpaddle.pir.Value): + cast_outputs.append( + paddle.cast( + cast_input, + paddle.base.core.DataType.FLOAT32, + ) + ) + else: + raise TypeError( + f"Unsupported test data type {type(cast_input)}." 
+ ) + + outputs = {} + for i in range(len(output_names)): + outputs.update({output_names[i]: [cast_outputs[i]]}) + + outputs_valid = {} + for output_name in output_names: + outputs_valid[output_name] = self._find_var_in_pir( + outputs, output_name + ) + loss_inputs = [] + for input_name in inputs_to_check: + loss_inputs.append(inputs_dict[input_name]) + + if user_defined_grad_outputs is None: + if len(outputs_valid) == 1: + for outputs_valid_key in outputs_valid: + loss = paddle.mean(outputs_valid[outputs_valid_key][0]) + else: + avg_sum = [] + for cur_loss in outputs_valid: + cur_avg_loss = paddle.mean(outputs_valid[cur_loss][0]) + avg_sum.append(cur_avg_loss) + loss_sum = paddle.add_n(avg_sum) + loss = paddle.scale(loss_sum, scale=1.0 / float(len(avg_sum))) + + grad_inputs = ir_grad( + outputs=paddle.utils.flatten(loss), + inputs=paddle.utils.flatten(loss_inputs), + grad_outputs=None, + ) + else: + grad_inputs = ir_grad( + outputs=paddle.utils.flatten(outputs), + inputs=paddle.utils.flatten(static_inputs), + grad_outputs=grad_outputs, + ) + fetch_list = list(grad_inputs) + # executor run + executor = paddle.static.Executor(place) + outs = executor.run( + ir_program, + feed=feed, + fetch_list=fetch_list, + ) + return outs + + +class OpTestTool: + @classmethod + def skip_if(cls, condition: object, reason: str): + return unittest.skipIf(condition, reason) + + @classmethod + def skip_if_not_cpu_bf16(cls): + return OpTestTool.skip_if( + not ( + isinstance(_current_expected_place(), core.CPUPlace) + and core.supports_bfloat16() + ), + "Place does not support BF16 evaluation", + ) + + @classmethod + def skip_if_not_cpu(cls): + return OpTestTool.skip_if( + not isinstance(_current_expected_place(), core.CPUPlace), + "OneDNN supports only CPU for now", + ) diff --git a/python/tests/orig_op_test.py b/python/tests/orig_op_test.py new file mode 120000 index 0000000000..adcdd3a239 --- /dev/null +++ b/python/tests/orig_op_test.py @@ -0,0 +1 @@ +../../Paddle/test/legacy_test/op_test.py \ No newline at end of file