Commit fbd16a1: Add RingFlashAttention for context parallel
1 parent: ae0bea9

10 files changed: +848, -23 lines

csrc/generation/flash_attn_bwd.cc

Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/extension.h"
#include <iostream>
#include <vector>

using paddle::Tensor;

namespace paddle {
namespace experimental {

PADDLE_API void flash_attn_grad(const Tensor& q,
                                const Tensor& k,
                                const Tensor& v,
                                const Tensor& out,
                                const Tensor& softmax_lse,
                                const Tensor& seed_offset,
                                const paddle::optional<Tensor> &attn_mask,
                                const Tensor& out_grad,
                                float dropout,
                                bool causal, Tensor* q_grad, Tensor* k_grad, Tensor* v_grad);

}
}  // namespace paddle


std::vector<Tensor> SRFlashAttnBwd(const Tensor &q,
                                   const Tensor &k,
                                   const Tensor &v,
                                   const Tensor &out,
                                   const Tensor &softmax_lse,
                                   const Tensor &seed_offset,
                                   const paddle::optional<Tensor> &attn_mask,
                                   const Tensor &out_grad,
                                   float dropout,
                                   bool causal);


std::vector<Tensor> SRFlashAttnBwd(const Tensor &q,
                                   const Tensor &k,
                                   const Tensor &v,
                                   const Tensor &out,
                                   const Tensor &softmax_lse,
                                   const Tensor &seed_offset,
                                   const paddle::optional<Tensor> &attn_mask,
                                   const Tensor &out_grad,
                                   float dropout,
                                   bool causal) {
    std::vector<Tensor> res(3);
    paddle::experimental::flash_attn_grad(q, k, v, out, softmax_lse, seed_offset, attn_mask,
                                          out_grad, dropout, causal, &res[0], &res[1],
                                          &res[2]);
    return res;
}


std::vector<paddle::DataType> SRFlashAttnBwdDtype(paddle::DataType q_dtype,
                                                  paddle::DataType k_dtype,
                                                  paddle::DataType v_dtype) {
    return {q_dtype, k_dtype, v_dtype};
}


std::vector<std::vector<int64_t>> SRFlashAttnBwdInferShape(
    std::vector<int64_t> q_shape, std::vector<int64_t> k_shape,
    std::vector<int64_t> v_shape) {
    return {q_shape, k_shape, v_shape};
}


PD_BUILD_OP(flash_attn_bwd)
    .Inputs({"q", "k", "v", "out", "softmax_lse", "seed_offset", "attn_mask", "out_grad"})
    .Outputs({"q_grad", "k_grad", "v_grad"})
    .Attrs({"dropout: float", "causal: bool"})
    .SetKernelFn(PD_KERNEL(SRFlashAttnBwd))
    .SetInferShapeFn(PD_INFER_SHAPE(SRFlashAttnBwdInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(SRFlashAttnBwdDtype));
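A minimal usage sketch of the op registered above, assuming the built extension is importable as paddlenlp_ops; the placeholder tensors stand in for values saved by a flash-attention forward pass and are not part of this commit.

# Minimal sketch, assuming the custom-op extension built from csrc/ is
# importable as `paddlenlp_ops`, and that q, k, v, out, softmax_lse,
# seed_offset and out_grad come from a prior flash-attention forward pass
# (placeholders here, not defined in this snippet).
import paddlenlp_ops  # assumed module name

q_grad, k_grad, v_grad = paddlenlp_ops.flash_attn_bwd(
    q, k, v,          # forward inputs
    out,              # forward output
    softmax_lse,      # log-sum-exp statistics saved by the forward kernel
    seed_offset,      # dropout RNG state saved by the forward kernel
    None,             # attn_mask is optional
    out_grad,         # gradient w.r.t. out
    0.0,              # dropout
    True,             # causal
)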

csrc/setup_cuda.py

Lines changed: 1 addition & 0 deletions
@@ -78,6 +78,7 @@ def get_gencode_flags():
         "./generation/step.cu",
         "./generation/quant_int8.cu",
         "./generation/dequant_int8.cu",
+        "./generation/flash_attn_bwd.cc",
     ],
     extra_compile_args={
         "cxx": ["-O3"],

llm/llama/run_trainer_tp2cp2.sh

Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


set -x
unset CUDA_VISIBLE_DEVICES

rm -rf log
rm -rf output

unset PADDLE_ELASTIC_JOB_ID
unset PADDLE_TRAINER_ENDPOINTS
unset DISTRIBUTED_TRAINER_ENDPOINTS
unset FLAGS_START_PORT
unset PADDLE_ELASTIC_TIMEOUT

# export FLAGS_embedding_deterministic=1
# export FLAGS_cudnn_deterministic=1
# export FLAGS_flash_attn_version=v1
# export USE_FAST_LN=0


max_seq_length=1024

master=127.0.0.1
port=36677

max_steps=10000
log_dir=seq_${max_seq_length}_log
echo "log_dir:${log_dir}"
rm -rf $log_dir

export PYTHONPATH=../../:$PYTHONPATH
python -u -m paddle.distributed.launch \
    --master $master:$port \
    --gpus "3,4,5,7" \
    --log_dir "./$log_dir" \
    run_pretrain.py \
    --model_name_or_path "facebook/llama-7b" \
    --tokenizer_name_or_path "facebook/llama-7b" \
    --input_dir "./data" \
    --output_dir "./output" \
    --split 949,50,1 \
    --max_seq_length $max_seq_length \
    --per_device_train_batch_size 1 \
    --gradient_accumulation_steps 4 \
    --per_device_eval_batch_size 4 \
    --bf16 \
    --fp16_opt_level "O2" \
    --use_flash_attention 1 \
    --virtual_pp_degree 1 \
    --pp_recompute_interval 1 \
    --learning_rate 0.00001 \
    --min_learning_rate 0.000001 \
    --max_steps $max_steps \
    --weight_decay 0.01 \
    --warmup_ratio 0.01 \
    --max_grad_norm 1.0 \
    --logging_steps 1 \
    --dataloader_num_workers 1 \
    --eval_steps 1001 \
    --disable_tqdm true \
    --continue_training 0 \
    --do_train \
    --device "gpu" \
    --enable_linear_fused_grad_add false \
    --recompute_use_reentrant true \
    --data_cache "./data_cache" \
    --pipeline_parallel_degree 1 \
    --cp_parallel_degree 2 \
    --tensor_parallel_degree 2 \
    --sequence_parallel false \
    --skip_profile_timer true \
    --amp_master_grad \
    --report_to "visualdl" \
    --logging_dir "./visualdl_log" \
    --save_steps 2000000 \
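As a rough consistency check on this launch config: with pipeline_parallel_degree 1, tensor_parallel_degree 2 and cp_parallel_degree 2, one model replica spans 2 x 2 = 4 ranks, which matches the four GPUs passed to --gpus. A small illustrative sketch of that arithmetic (not part of the commit):

# Illustrative check that the parallel degrees above fit the GPU list.
tp, cp, pp = 2, 2, 1             # --tensor_parallel_degree, --cp_parallel_degree, --pipeline_parallel_degree
gpus = "3,4,5,7".split(",")      # --gpus
ranks_per_replica = tp * cp * pp
assert len(gpus) % ranks_per_replica == 0  # 4 % 4 == 0, so data-parallel degree is 1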

llm/run_pretrain.py

Lines changed: 4 additions & 0 deletions
@@ -485,11 +485,15 @@ def main():
     config.attention_probs_dropout_prob = model_args.attention_probs_dropout_prob

     config.sep_parallel_degree = training_args.sep_parallel_degree
+    config.cp_parallel_degree = training_args.cp_parallel_degree
     if config.sequence_parallel:
         assert config.tensor_parallel_degree > 1, "tensor_parallel_degree must be larger than 1 for sequence parallel."
         assert (
             config.num_attention_heads % config.sep_parallel_degree == 0
         ), f"num_attention_heads:{config.num_attention_heads} must be divisible by sep_parallel_degree {config.sep_parallel_degree}"
+        assert (
+            config.seq_length % config.cp_parallel_degree == 0
+        ), f"seq_length:{config.seq_length} must be divisible by cp_parallel_degree {config.cp_parallel_degree}"

     if get_env_device() == "xpu" and training_args.gradient_accumulation_steps > 1:
         try:
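The new assertion reflects how context parallelism shards the sequence dimension: each context-parallel rank holds seq_length / cp_parallel_degree tokens, so the division must be exact. A tiny illustration with the values from the example script (illustrative only):

# Illustration of the added check: every cp rank gets an equal slice of the sequence.
seq_length = 1024          # --max_seq_length in the example script
cp_parallel_degree = 2     # --cp_parallel_degree
assert seq_length % cp_parallel_degree == 0
tokens_per_cp_rank = seq_length // cp_parallel_degree  # 512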

paddlenlp/trainer/trainer.py

Lines changed: 13 additions & 2 deletions
@@ -81,6 +81,7 @@
     from ..quantization.quantization_linear import QuantizationLinear
 except:
     QuantizationLinear = None
+from ..transformers.context_parallel_utils import split_inputs_sequence_dim_load_balance
 from ..transformers.model_utils import (
     PretrainedModel,
     _add_variant,
@@ -763,6 +764,8 @@ def train(
         trainable_numel = int(trainable_numel_tensor.item()) // self.args.dataset_world_size
         if self.args.sep_parallel_degree > 0:
             trainable_numel = trainable_numel // self.args.sep_parallel_degree
+        if self.args.cp_parallel_degree > 0:
+            trainable_numel = trainable_numel // self.args.cp_parallel_degree
         # the numel is roughly, because the tensor parallel still hold own bias or layer_norm weight without splited
         # so, the trainable numel is a little bigger than real.
         logger.debug(f" Number of trainable parameters = {trainable_numel:,} (all devices, roughly)")
@@ -897,6 +900,8 @@ def _inner_training_loop(
             for step, inputs in enumerate(epoch_iterator):
                 if self.args.use_hybrid_parallel and self.args.sep_parallel_degree > 1:
                     inputs = split_inputs_sequence_dim(inputs)
+                if self.args.use_hybrid_parallel and self.args.cp_parallel_degree > 1:
+                    inputs = split_inputs_sequence_dim_load_balance(inputs)
                 self.timers and self.timers("read-data").stop()
                 os.environ["TRAINER_GLOBAL_STEP"] = str(self.state.global_step)
                 self.callback_handler.on_load_data_end(args, self.state, self.control, inputs=inputs)
@@ -1006,7 +1011,11 @@ def _inner_training_loop(
                         assert reshard_util.is_sharding_opt(self.optimizer)
                         self.optimizer._inner_opt.reduce_gradients(list(parameters_list), self.optimizer._hcg)

-                    if self.optimizer._dp_enable or getattr(self.optimizer, "_sep_enable", False):
+                    if (
+                        self.optimizer._dp_enable
+                        or getattr(self.optimizer, "_sep_enable", False)
+                        or getattr(self.optimizer, "_cp_enable", False)
+                    ):
                         fused_allreduce_gradients(list(parameters_list), self.optimizer._hcg)

                     self.timers and self.timers("all-reduce").stop()
@@ -1733,6 +1742,7 @@ def _wrap_model(self, model, training=True):
         in_sharding_parallel_mode = self.sharding is not None
         in_tensor_parallel_mode = self.args.tensor_parallel_degree > 1
         in_sep_parallel_mode = self.args.sep_parallel_degree > 1
+        in_cp_parallel_mode = self.args.cp_parallel_degree > 1

         # Multi-gpu training
         if (
@@ -1743,6 +1753,7 @@ def _wrap_model(self, model, training=True):
                 or in_sharding_parallel_mode
                 or in_tensor_parallel_mode
                 or in_sep_parallel_mode
+                or in_cp_parallel_mode
             )
         ):
             model = paddle.DataParallel(model)
@@ -1870,7 +1881,7 @@ def get_expected_keys(inputs, keys):
         if (
             not in_pipeline_parallel_mode
             and not in_sharding_parallel_mode
-            and (in_tensor_parallel_mode or in_sep_parallel_mode)
+            and (in_tensor_parallel_mode or in_sep_parallel_mode or in_cp_parallel_mode)
         ):
             if self.args.amp_master_grad:
                 mix_precision_utils.MixPrecisionLayer(model, dtype=self.amp_dtype)  # return value has no use
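split_inputs_sequence_dim_load_balance itself lives in paddlenlp.transformers.context_parallel_utils and is not shown in this diff. The sketch below illustrates the usual load-balanced ("zigzag") sequence split used with causal ring attention, where each rank takes one chunk from the front of the sequence and its mirror chunk from the back; it is written under that assumption and is not the actual implementation.

# Illustrative zigzag split for context parallelism (an assumption about the
# load-balancing scheme; the real helper is split_inputs_sequence_dim_load_balance).
import paddle

def zigzag_split(x, cp_degree, cp_rank, seq_axis=1):
    # Cut the sequence into 2 * cp_degree chunks and give rank i chunks
    # i and (2 * cp_degree - 1 - i), so causal-attention work is balanced.
    chunks = paddle.split(x, 2 * cp_degree, axis=seq_axis)
    keep = [chunks[cp_rank], chunks[2 * cp_degree - 1 - cp_rank]]
    return paddle.concat(keep, axis=seq_axis)

# Example: 8 tokens, cp_degree=2 -> rank 0 holds tokens [0, 1, 6, 7].
tokens = paddle.arange(8).reshape([1, 8])
print(zigzag_split(tokens, cp_degree=2, cp_rank=0))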
