
Commit 6002abb

torchtune usecase
1 parent e51c64a commit 6002abb

File tree

5 files changed: +182 −0 lines

3.test_cases/torchtune/.gitignore

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
checkpoints
models
miniconda3
pt_torchtune
torchtune
Miniconda3-latest-Linux-x86_64.sh
Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
#!/usr/bin/env bash
set -ex

# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
chmod +x Miniconda3-latest-Linux-x86_64.sh
./Miniconda3-latest-Linux-x86_64.sh -b -f -p ./miniconda3

source ./miniconda3/bin/activate

conda create -y -p ./pt_torchtune python=3.10

source activate ./pt_torchtune/

# Install AWS PyTorch, see https://aws-pytorch-doc.com/
# conda install -y pytorch=2.2.0 torchvision torchaudio torchtriton=2.2.0 pytorch-cuda=12.1 transformers datasets --strict-channel-priority --override-channels -c https://aws-ml-conda.s3.us-west-2.amazonaws.com -c nvidia -c conda-forge
conda install -y pytorch torchvision torchaudio pytorch-cuda=12.1 transformers datasets -c pytorch -c nvidia

git clone https://github.com/pytorch/torchtune.git
pip install -e ./torchtune

# Create the checkpoint directory
mkdir checkpoints
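
A minimal usage sketch for the setup script above, assuming it is saved as create_env.sh (a hypothetical name; the diff view does not show it). Fresh shells need to re-activate the environment before calling tune:

bash ./create_env.sh    # hypothetical file name, run from the test-case directory

# In a new shell, re-activate the environment first
source ./miniconda3/bin/activate
source activate ./pt_torchtune/
tune --help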
Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
#!/bin/bash

# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

# set -ex;

# Default value for HF_MODEL
DEFAULT_HF_MODEL="meta-llama/Llama-2-7b"
read -p "Please enter a Hugging Face model ($DEFAULT_HF_MODEL): " HF_MODEL
if [ -z "$HF_MODEL" ]; then
    HF_MODEL="$DEFAULT_HF_MODEL"
fi

read -p "Please enter your Hugging Face access token: " HF_TOKEN

mkdir -p models/${HF_MODEL}

tune download \
    ${HF_MODEL} \
    --output-dir models/${HF_MODEL} \
    --hf-token ${HF_TOKEN}
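
The interactive prompts above are equivalent to calling tune download directly; a sketch using the default model (replace <HF_TOKEN> with a real Hugging Face access token):

mkdir -p models/meta-llama/Llama-2-7b
tune download \
    meta-llama/Llama-2-7b \
    --output-dir models/meta-llama/Llama-2-7b \
    --hf-token <HF_TOKEN>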
Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
#!/bin/bash

# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

#SBATCH --nodes=1           # number of nodes to use
#SBATCH --job-name=full_ft  # name of your job
#SBATCH --exclusive         # job has exclusive use of the resource, no sharing

set -ex;

###########################
###### User Variables #####
###########################

GPUS_PER_NODE=4 # 4 for G5.12x, 8 for P4/P5

###########################
## Environment Variables ##
###########################

## Plenty of EFA-level variables
## Comment out for non-EFA instances (G4dn, P3)
## For G5.12x, comment out RDMA and fork-safe
## For G4dn and other G5, comment out all
# export FI_EFA_USE_DEVICE_RDMA=1 # use for p4d
# export FI_EFA_FORK_SAFE=1
export FI_LOG_LEVEL=1
export FI_PROVIDER=efa
export NCCL_DEBUG=INFO
## Switching SYNC_MEMOPS to zero can boost throughput with FSDP
## Disables CU_POINTER_ATTRIBUTE_SYNC_MEMOPS
## Reduces memory synchronizations
## https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__UNIFIED.html
export FI_EFA_SET_CUDA_SYNC_MEMOPS=0

###########################
####### Torch Dist ########
###########################

declare -a TORCHRUN_ARGS=(
    --nproc_per_node=$GPUS_PER_NODE
    --nnodes=$SLURM_JOB_NUM_NODES
    --rdzv_id=$SLURM_JOB_ID
    --rdzv_backend=c10d
    --rdzv_endpoint=$(hostname)
)

export TORCHTUNE=./pt_torchtune/bin/tune
export TRAIN_CONFIG=./llama2_7B_full.yaml

srun -l ${TORCHTUNE} run "${TORCHRUN_ARGS[@]}" full_finetune_distributed --config ${TRAIN_CONFIG}
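
A short submission sketch, assuming the Slurm script above is saved as full_finetune_distributed.sbatch (a hypothetical name; the diff view does not show it):

sbatch full_finetune_distributed.sbatch    # hypothetical file name

# Follow training output; by default Slurm writes it to slurm-<jobid>.out
tail -f slurm-<jobid>.out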
Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
# Config for multi-device full finetuning in full_finetune_distributed.py
# using a Llama2 7B model
#
# This config assumes that you've run the following command before launching
# this run:
#   tune download meta-llama/Llama-2-7b \
#     --hf-token <HF_TOKEN> \
#     --output-dir /tmp/llama2
#
# To launch on 4 devices, run the following command from root:
#   tune run --nproc_per_node 4 full_finetune_distributed \
#     --config llama2/7B_full
#
# You can add specific overrides through the command line. For example,
# to override the checkpointer directory while launching training
# you can run:
#   tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed \
#     --config llama2/7B_full \
#     checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works best when the model is being fine-tuned on 2+ GPUs.
# Single-device full finetuning requires more memory optimizations. It's
# best to use 7B_full_single_device.yaml for those cases.


# Tokenizer
tokenizer:
  _component_: torchtune.models.llama2.llama2_tokenizer
  path: models/meta-llama/Llama-2-7b/tokenizer.model

# Dataset
dataset:
  _component_: torchtune.datasets.alpaca_dataset
  train_on_input: True
seed: null
shuffle: True

# Model Arguments
model:
  _component_: torchtune.models.llama2.llama2_7b

checkpointer:
  _component_: torchtune.utils.FullModelMetaCheckpointer
  checkpoint_dir: models/meta-llama/Llama-2-7b
  checkpoint_files: [consolidated.00.pth]
  recipe_checkpoint: null
  output_dir: models/meta-llama/Llama-2-7b
  model_type: LLAMA2
resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 2
epochs: 3
optimizer:
  _component_: torch.optim.AdamW
  lr: 2e-5
loss:
  _component_: torch.nn.CrossEntropyLoss
max_steps_per_epoch: null
gradient_accumulation_steps: 1


# Training env
device: cuda

# Memory management
enable_activation_checkpointing: True

# Reduced precision
dtype: bf16

# Logging
metric_logger:
  _component_: torchtune.utils.metric_logging.DiskLogger
  log_dir: ${output_dir}
output_dir: /tmp/alpaca-llama2-finetune
log_every_n_steps: null
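
Tying the pieces together: the Slurm script above points TRAIN_CONFIG at this file, and any key can be overridden from the command line as the header comments describe. A sketch (the output_dir value here is an illustrative choice, not part of the commit):

tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed \
    --config ./llama2_7B_full.yaml \
    output_dir=./checkpoints/alpaca-llama2-finetune    # illustrative override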
