diff --git a/docs/fine-tuning/finetune_ipex.md b/docs/fine-tuning/finetune_ipex.md
new file mode 100644
index 0000000..30f35d1
--- /dev/null
+++ b/docs/fine-tuning/finetune_ipex.md
@@ -0,0 +1,60 @@
+# How to Run DLSA Fine-Tuning with IPEX (FP32, BF16)
+
+## Support Matrix
+
+| Category | Script |
+| -------------------- | -------------- |
+| IPEX Single Instance | ft_ipex.sh |
+| IPEX Multi-Instance | ft_ipex_ccl.sh |
+
+## Single Instance Fine-Tuning
+
+```
+./fine-tuning/ft_ipex.sh
+```
+
+By default, it will launch 1 instance to run fine-tuning with the SST-2 dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script.
+
+Below is the help message by using the command `./fine-tuning/ft_ipex.sh -h`:
+
+```markdown
+Usage: ./fine-tuning/ft_ipex.sh [OPTIONS]
+OPTION includes:
+   -l | --log_name - the log name of this round
+   -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET
+   -b | --batch_size - batch size per instance
+   -s | --sequence_len - max sequence length
+   --dtype_ft - data type used for fine-tuning
+   --train_epoch - train epoch
+   -h | --help - displays this message
+```
+## Multi-Instance Fine-Tuning
+
+
+### Running a single instance
+
+```
+bash fine-tuning/run_dist.sh -np 1 -ppn 1 bash fine-tuning/ft_ipex_ccl.sh
+```
+
+By default, it will launch 1 instance to run fine-tuning with the SST-2 dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script.
+
+### Running multiple instances
+
+```
+bash fine-tuning/run_dist.sh -np 2 -ppn 2 bash fine-tuning/ft_ipex_ccl.sh
+```
+
+By default, it will launch 2 instances on a single node to run fine-tuning with the SST-2 dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script.
+
+> Note:
+>
+> `np`: number of processes, i.e., the total number of processes you will run across the cluster.
+>
+> `ppn`: processes per node, i.e., how many processes you will run on each worker node.
+>
+> For example, to run on 2 nodes with 1 process per node, use `-np 2 -ppn 1`;
+>
+> to run on 4 nodes with 2 processes per node, use `-np 8 -ppn 2`.
+>
+> You can also pass `-l $log_name` after `run_dist.sh` to set the log name.
diff --git a/docs/fine-tuning/finetune_tpp.md b/docs/fine-tuning/finetune_tpp.md
new file mode 100644
index 0000000..6bbcd02
--- /dev/null
+++ b/docs/fine-tuning/finetune_tpp.md
@@ -0,0 +1,60 @@
+# How to Run DLSA Fine-Tuning with TPP (FP32, BF16)
+
+## Support Matrix
+
+| Category | Script |
+| ------------------- | -------------- |
+| TPP Single Instance | ft_tpp.sh |
+| TPP Multi-Instance | ft_tpp_ccl.sh |
+
+## Single Instance Fine-Tuning
+
+```
+./fine-tuning/ft_tpp.sh
+```
+
+By default, it will launch 1 instance to run fine-tuning with the IMDB dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script.
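+
+For example, a run that overrides the defaults might look like the following; the flags are the ones documented in the help message below, and the values here are only illustrative:
+
+```
+./fine-tuning/ft_tpp.sh -d sst2 -b 16 -s 55 --dtype_ft bf16 --train_epoch 2
+```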
+
+Below is the help message by using the command `./fine-tuning/ft_tpp.sh -h`:
+
+```markdown
+Usage: ./fine-tuning/ft_tpp.sh [OPTIONS]
+OPTION includes:
+   -l | --log_name - the log name of this round
+   -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET
+   -b | --batch_size - batch size per instance
+   -s | --sequence_len - max sequence length
+   --dtype_ft - data type used for fine-tuning
+   --train_epoch - train epoch
+   -h | --help - displays this message
+```
+## Multi-Instance Fine-Tuning
+
+
+### Running a single instance
+
+```
+bash fine-tuning/run_dist.sh -np 1 -ppn 1 bash fine-tuning/ft_tpp_ccl.sh
+```
+
+By default, it will launch 1 instance to run fine-tuning with the IMDB dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script.
+
+### Running multiple instances
+
+```
+bash fine-tuning/run_dist.sh -np 2 -ppn 2 bash fine-tuning/ft_tpp_ccl.sh
+```
+
+By default, it will launch 2 instances on a single node to run fine-tuning with the IMDB dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script.
+
+> Note:
+>
+> `np`: number of processes, i.e., the total number of processes you will run across the cluster.
+>
+> `ppn`: processes per node, i.e., how many processes you will run on each worker node.
+>
+> For example, to run on 2 nodes with 1 process per node, use `-np 2 -ppn 1`;
+>
+> to run on 4 nodes with 2 processes per node, use `-np 8 -ppn 2`.
+>
+> You can also pass `-l $log_name` after `run_dist.sh` to set the log name.
diff --git a/docs/fine-tuning/finetune_trainer.md b/docs/fine-tuning/finetune_trainer.md
new file mode 100644
index 0000000..5902a8e
--- /dev/null
+++ b/docs/fine-tuning/finetune_trainer.md
@@ -0,0 +1,22 @@
+# How to Run DLSA Single Node Fine-Tuning with HF Trainer (FP32, BF16)
+
+## Single Instance Fine-Tuning
+
+```
+./fine-tuning/ft_trainer.sh
+```
+
+By default, it will launch 1 instance to run fine-tuning with the SST-2 dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script.
+
+Below is the help message by using the command `./fine-tuning/ft_trainer.sh -h`:
+
+```markdown
+Usage: ./fine-tuning/ft_trainer.sh [OPTIONS]
+OPTION includes:
+   -l | --log_name - the log name of this round
+   -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET
+   -b | --batch_size - batch size per instance
+   -s | --sequence_len - max sequence length
+   --dtype_ft - data type used for fine-tuning
+   -h | --help - displays this message
+```
diff --git a/docs/fine-tuning/multi-nodes-ipex.md b/docs/fine-tuning/multi-nodes-ipex.md
deleted file mode 100644
index a51558e..0000000
--- a/docs/fine-tuning/multi-nodes-ipex.md
+++ /dev/null
@@ -1,50 +0,0 @@
-# How to Run DLSA Multi Instance Fine-Tuning with IPEX (FP32, BF16)
-
-## Install MPI library:
-
-Install MPI from [here]( https://anaconda.org/intel/impi_rt )
-
-MPI is included in the Intel OneAPI Toolkit. It's recommended to use the package manager to install.
-
-> Note: This step should be operated on all the work nodes
-
-## To run:
-
-```
-source /opt/intel/oneapi/mpi/latest/env/vars.sh
-cd profiling-transformers
-```
-
-> Note:
->
-> np: num process, means how many processes you will run on a cluster
->
-> ppn: process per node, means how many processes you will run on 1 worker node.
-> -> For example, if I want to run on 2 nodes, each node runs with 1 process, use the config `-np 2 -ppn 1` -> -> if I want to run on 4 nodes, each node runs with 2 processes, use the config `-np 8 -ppn 2` - -### Running single process in single node - -``` -bash fine-tuning/run_dist.sh -np 1 -ppn 1 bash fine-tuning/run_ipex_native.sh -``` - -### Running multi instances in single node - -``` -# Run 2 instances in single node -bash fine-tuning/run_dist.sh -np 2 -ppn 2 bash fine-tuning/run_ipex_native.sh -``` - -### Running with IPEX BF16 - -> Before you run BF16 fine-tuning, you need to verify whether your server supports BF16. (Only Copper Lake & Sapphire Rapids CPUs support BF16) - -add `--bf16_ipex_ft` at the end of the command: - -``` -bash fine-tuning/run_dist.sh -np 2 -ppn 2 bash fine-tuning/run_ipex_native.sh --bf16_ipex_ft 1 -``` - diff --git a/docs/fine-tuning/multi-nodes-stock-pytorch.md b/docs/fine-tuning/multi-nodes-stock-pytorch.md deleted file mode 100644 index 10e4320..0000000 --- a/docs/fine-tuning/multi-nodes-stock-pytorch.md +++ /dev/null @@ -1,44 +0,0 @@ -# How to Run DLSA Multi Node Fine-Tuning with Stock PyTorch(FP32) - -## Install MPI library: - -Install MPI from [here]( https://anaconda.org/intel/impi_rt ) - - -MPI is included in the Intel OneAPI Toolkit. It's recommended to use the package manager to install. - -> Note: This step should be operated on all the work nodes - -## To run: - -``` -source /opt/intel/oneapi/mpi/latest/env/vars.sh -cd profiling-transformers -``` - -> Note: -> -> np: num process, means how many processes you will run on a cluster -> -> ppn: process per node, means how many processes you will run on 1 worker node. -> -> For example, if I want to run on 2 nodes, each node runs with 1 process, use the config `-np 2 -ppn 1` -> -> if I want to run on 4 nodes, each node runs with 2 processes, use the config `-np 8 -ppn 2` - -### Running single process in single node - -``` -bash fine-tuning/run_dist.sh -np 1 -ppn 1 bash fine-tuning/run_ipex_native.sh -``` - -### Running multi-node fine-tuning - -> You need to create the `hostfile` which contains all nodes you want to run on and set password-free login. - -``` -bash fine-tuning/run_dist.sh -np 2 -ppn 1 -f hostfile bash fine-tuning/run_ipex_native.sh -``` - - - diff --git a/docs/fine-tuning/single-node-ipex.md b/docs/fine-tuning/single-node-ipex.md deleted file mode 100644 index 4baf9cd..0000000 --- a/docs/fine-tuning/single-node-ipex.md +++ /dev/null @@ -1,28 +0,0 @@ -# How to Run DLSA Single Node Fine-Tuning with IPEX(FP32, BF16) - -## Running on CPU - -### Single node - -``` -./fine-tuning/train_native.sh -``` - -By default, it will launch 1 instance to run fine-tuning with SST-2 dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script. 
- -Below is the help message by using the command `./fine-tuning/train_native.sh -h`: - -```markdown -Usage: ./fine-tuning/train_native.sh [OPTIONS] -OPTION includes: - -l | --log_name - the log name of this round - -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET - -b | --batch_size - batch size per instance - -s | --sequence_len - max sequence length - --bf16_ipex_ft - wether to use bf16_ipex_ft precision - --fp32_ipex_ft - wether to use fp32_ipex_ft precision - -h | --help - displays this message -``` - - - diff --git a/docs/fine-tuning/single-node-stock-pytorch.md b/docs/fine-tuning/single-node-stock-pytorch.md deleted file mode 100644 index 8b5f42f..0000000 --- a/docs/fine-tuning/single-node-stock-pytorch.md +++ /dev/null @@ -1,26 +0,0 @@ -# How to Run DLSA Single Node Fine-Tuning Pipeline with Stock PyTorch - -## Running on CPU - -### Single node - -``` -./fine-tuning/train_native.sh -``` - -By default, it will launch 1 instance to run fine-tuning with SST-2 dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script. - -Below is the help message by using the command `./fine-tuning/train_native.sh -h`: - -```markdown -Usage: ./fine-tuning/train_native.sh [OPTIONS] -OPTION includes: - -l | --log_name - the log name of this round - -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET - -b | --batch_size - batch size per instance - -s | --sequence_len - max sequence length - ~~--bf16_ipex_ft - wether to use bf16_ipex_ft precision~~ - ~~--fp32_ipex_ft - wether to use fp32_ipex_ft precision~~ - -h | --help - displays this message -``` - diff --git a/docs/fine-tuning/single-node-trainer.md b/docs/fine-tuning/single-node-trainer.md deleted file mode 100644 index 11da772..0000000 --- a/docs/fine-tuning/single-node-trainer.md +++ /dev/null @@ -1,28 +0,0 @@ -# How to Run DLSA Single Node Fine-Tuning with Trainer(FP32, BF16) - -## Running on CPU - -### Single node - -``` -./fine-tuning/train_trainer.sh -``` - -By default, it will launch 1 instance to run fine-tuning with SST-2 dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script. 
- -Below is the help message by using the command `./fine-tuning/train_native.sh -h`: - -```markdown -Usage: ./fine-tuning/train_trainer.sh [OPTIONS] -OPTION includes: - -l | --log_name - the log name of this round - -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET - -b | --batch_size - batch size per instance - -s | --sequence_len - max sequence length - --bf16 - whether using hf bf16 inference - --use_ipex - whether using ipex - -h | --help - displays this message -``` - - - diff --git a/docs/index.md b/docs/index.md index 61f8f2d..a2c0932 100644 --- a/docs/index.md +++ b/docs/index.md @@ -4,30 +4,19 @@ DLSA is Intel optimized representative End-to-end Fine-Tuning & Inference pipeli ![Image](assets/images/DLSA_workflow.PNG) -## Prerequisites -### Download the repo +## Run on bare-metal + +### Prerequisites +#### Download the repo ``` #download the repo -git clone https://github.com/intel/document-level-sentiment-analysis.git +git clone https://github.com/intel-innersource/frameworks.ai.end2end-ai-pipelines.dlsa.git cd frameworks.ai.end2end-ai-pipelines.dlsa/profiling-transformers -git checkout v1.0.0 -``` - -### Download the datasets: - -``` -mkdir datasets -cd datasets -#download and extract SST-2 dataset -wget https://dl.fbaipublicfiles.com/glue/data/SST-2.zip && unzip SST-2.zip && mv SST-2 sst -#download and extract IMDB dataset -wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz && tar -zxf aclImdb_v1.tar.gz ``` ->Note: Make sure the network connections work well for downloading the datasets. -## Deploy the test environment -### Download Miniconda and install it +### Deploy the test environment (Bare-metal) +#### Download Miniconda and install it ``` wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh @@ -36,7 +25,7 @@ sh Miniconda3-latest-Linux-x86_64.sh > Note: If you have already installed conda on your system, just skip this step. 
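+
+Before moving on, you can sanity-check the install; this assumes the installer added conda to your shell profile:
+
+```
+conda --version
+```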
-### Prepare the conda environment for DLSA
+#### Prepare the conda environment for DLSA

```
conda create -n dlsa python=3.8 --yes
conda activate dlsa
sh install.sh
```

-## Running DLSA Inference Pipeline
+### Running DLSA Inference Pipeline

-| Implementations | Model | API | Framework | Precision |
-| -------------------------------------------------------- | -------- | ----------- | -------------- | -------------- |
-| [Run with HF Transformers](inference/hf-transformers.md) | HF Model | Trainer | PyTorch + IPEX | FP32,BF16 |
-| [Run with Stock Pytorch](inference/stock-pytorch.md) | HF Mode | Non-trainer | PyTorch | FP32 |
-| [Run with IPEX](inference/ipex.md) | HF Mode | Non-trainer | PyTorch + IPEX | FP32,BF16,INT8 |
+| Implementations | Model | Instance | Framework | Precision |
+| -------------------------------------------- | -------- | ------------ | -------------- | ---------------- |
+| [HF Trainer](inference/infer_trainer.md) | HF Model | Single/Multi | PyTorch + IPEX | FP32, BF16 |
+| [IPEX](inference/infer_ipex.md) | HF Model | Single/Multi | PyTorch + IPEX | FP32, BF16, INT8 |

-## Running DLSA Fine-Tuning Pipeline
+### Running DLSA Fine-Tuning Pipeline

-### Single Node Fine-Tuning
-| Implementations | Model | Instance | API | Framework | Precision |
-| ---------------------------------- | -------- | -------- | ----------- | ----------------------- | ---------- |
-| [Run with HF Transformers + IPEX ](fine-tuning/single-node-trainer.md) | HF Model | Single | Trainer | PyTorch + IPEX | FP32, BF16 |
-| [Run with Stock Pytorch](fine-tuning/single-node-stock-pytorch.md) | HF Model | Single | Non-trainer | PyTorch | FP32 |
-| [Run with IPEX (Single Instance)](fine-tuning/single-node-ipex.md) | HF Model | Single | Non-trainer | PyTorch + IPEX | FP32,BF16 |
-| [Run with IPEX (Multi Instance)](fine-tuning/multi-nodes-ipex.md) | HF Model | Multiple | Non-trainer | PyTorch + IPEX | FP32,BF16 |
+| Implementations | Model | Instance | Framework | Precision |
+| --------------------------------------------- | --------- | ------------ | --------------- | ---------- |
+| [HF Trainer](fine-tuning/finetune_trainer.md) | HF Model | Single | PyTorch + IPEX | FP32, BF16 |
+| [IPEX](fine-tuning/finetune_ipex.md) | HF Model | Single/Multi | PyTorch + IPEX | FP32, BF16 |
+| [TPP](fine-tuning/finetune_tpp.md) | HF Model | Single/Multi | PyTorch + TPP | FP32, BF16 |
+
+
+
+## Run on Docker
+
+Please follow the directions from [intel ai-workflows](https://github.com/intel/ai-workflows/tree/main/language_modeling/pytorch/bert_large/training) to run DLSA on Docker.

## Issue Tracking

-E2E DLSA tracks both bugs and enhancement requests using [Github](https://github.com/intel/document-level-sentiment-analysis/issues). We welcome input, however, before filing a request, please make sure you do the following:
+E2E DLSA tracks both bugs and enhancement requests using [GitHub](https://github.com/intel-innersource/frameworks.ai.end2end-ai-pipelines.dlsa/issues). We welcome input; however, before filing a request, please make sure you do the following:
+ Search the GitHub issue database.
diff --git a/docs/inference/ipex.md b/docs/inference/infer_ipex.md
similarity index 61%
rename from docs/inference/ipex.md
rename to docs/inference/infer_ipex.md
index e91c5df..78513ad 100644
--- a/docs/inference/ipex.md
+++ b/docs/inference/infer_ipex.md
@@ -4,8 +4,8 @@
| Category | Script |
| ------------------- | ------------------ |
-| CPU Single Instance | single_instance.sh |
-| CPU Multi Instances | multi_instance.sh |
+| IPEX Single Instance | inf_ipex_single.sh |
+| IPEX Multi-Instance | inf_ipex_multi.sh |

> Note: Please use the fine-tuned model for correct accuracy. Just change the `MODEL_NAME_OR_PATH` in the script before you run it. By default, the `MODEL_NAME_OR_PATH` is `bert-large-uncased`, which is downloaded from the Hugging Face website.

@@ -16,24 +16,21 @@
### Single instance

```
-./inference/single_instance.sh
+./inference/inf_ipex_single.sh
```

By default, it will launch 1 instance to run inference with the SST-2 dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script.

-Below is the help message by using the command `./inference/single_instance.sh -h`:
+Below is the help message by using the command `./inference/inf_ipex_single.sh -h`:

```markdown
-Usage: ./inference/single_instance.sh [OPTIONS]
+Usage: ./inference/inf_ipex_single.sh [OPTIONS]
OPTION includes:
   -l | --log_name - the log name of this round
-   -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET
+   -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET
   -b | --batch_size - batch size per instance
   -s | --sequence_len - max sequence length
-   --ipex_fp32 - wether to use ipex_fp32 precision
-   --ipex_bf16 - wether to use ipex_bf16 precision
-   --int8 - wether to use int8 precision
-   --int8_bf16 - wether to use int8_bf16 precision
+   --dtype_inf - data type used for inference
   -h | --help - displays this message
```

@@ -42,24 +39,21 @@ OPTION includes:
### Multi-instance

```
-./inference/multi_instance.sh
+./inference/inf_ipex_multi.sh
```

By default, it will launch 2 instances (1 instance/socket) to run inference with the SST-2 dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script.
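+
+For example, a multi-instance run that overrides the defaults might look like the following; the flags are documented in the help message below, and the values are only illustrative:
+
+```
+./inference/inf_ipex_multi.sh -n 2 -b 16 --dtype_inf bf16
+```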
-Below is the help message by using the command `./inference/multi_instance.sh -h`
+Below is the help message by using the command `./inference/inf_ipex_multi.sh -h`

```markdown
-Usage: ./inference/multi_instance.sh [OPTIONS]
+Usage: ./inference/inf_ipex_multi.sh [OPTIONS]
OPTION includes:
   -l | --log_name - the log name of this round
-   -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET
+   -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET
   -n | --num_of_ins_per_socket - number of instance per socket
   -b | --batch_size - batch size per instance
   -s | --sequence_len - max sequence length
-   --ipex_fp32 - wether to use ipex_fp32 precision
-   --ipex_bf16 - wether to use ipex_bf16 precision
-   --int8 - wether to use int8 precision
-   --int8_bf16 - wether to use int8_bf16 precision
+   --dtype_inf - data type used for inference
   -h | --help - displays this message
```
diff --git a/docs/inference/hf-transformers.md b/docs/inference/infer_trainer.md
similarity index 62%
rename from docs/inference/hf-transformers.md
rename to docs/inference/infer_trainer.md
index cf73841..8354086 100644
--- a/docs/inference/hf-transformers.md
+++ b/docs/inference/infer_trainer.md
@@ -4,8 +4,8 @@
|Category | Script |
|---|---|
-|CPU Single Instance | cpu_single_instance.sh |
-|CPU Multi Instances | cpu_multi_instance.sh |
+|CPU Single Instance | inf_trainer_single.sh |
+|CPU Multi-Instance | inf_trainer_multi.sh |

> Note: Please use the fine-tuned model for correct accuracy. Just change the `MODEL_NAME_OR_PATH` in the script before you run it. By default, the `MODEL_NAME_OR_PATH` is `bert-large-uncased`, which is downloaded from the Hugging Face website.

@@ -14,22 +14,21 @@
### Single instance

```
-./inference/cpu_single_instance.sh
+./inference/inf_trainer_single.sh
```

By default, it will launch 1 instance to run inference with the SST-2 dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script.

-Below is the help message by using the command `./inference/cpu_single_instance.sh -h`:
+Below is the help message by using the command `./inference/inf_trainer_single.sh -h`:

```markdown
-Usage: ./inference/cpu_single_instance.sh [OPTIONS]
+Usage: ./inference/inf_trainer_single.sh [OPTIONS]
OPTION includes:
   -l | --log_name - the log name of this round
-   -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET
+   -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET
   -b | --batch_size - batch size per instance
   -s | --sequence_len - max sequence length
-   --bf16 - whether using hf bf16 inference
-   --use_ipex - whether using ipex
+   --dtype_inf - data type used for inference
   -h | --help - displays this message
```

@@ -38,22 +37,21 @@ OPTION includes:
### Multi-instance

```
-./inference/cpu_multi_instance.sh
+./inference/inf_trainer_multi.sh
```

By default, it will launch 2 instances (1 instance/socket) to run inference with the SST-2 dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script.
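+
+As an illustration, you might run 2 instances per socket on IMDB with BF16; the flags are documented in the help message below, and the values are only examples:
+
+```
+./inference/inf_trainer_multi.sh -d imdb -n 2 -b 8 --dtype_inf bf16
+```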
-Below is the help message by using the command `./inference/cpu_multi_instance.sh -h` +Below is the help message by using the command `./inference/inf_trainer_multi.sh -h` ```markdown -Usage: ./inference/cpu_multi_instance.sh [OPTIONS] +Usage: ./inference/inf_trainer_multi.sh [OPTIONS] OPTION includes: -l | --log_name - the log name of this round - -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET + -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET -n | --num_of_ins_per_socket - number of instance per socket -b | --batch_size - batch size per instance -s | --sequence_len - max sequence length - --bf16 - whether using hf bf16 inference - --use_ipex - whether using ipex + --dtype_inf - data type used for inference -h | --help - displays this message ``` diff --git a/docs/inference/stock-pytorch.md b/docs/inference/stock-pytorch.md deleted file mode 100644 index 2525fb6..0000000 --- a/docs/inference/stock-pytorch.md +++ /dev/null @@ -1,64 +0,0 @@ -# How to Run DLSA Inference Pipeline with Stock PyTorch - -## Support Matrix - -|Categoty | Script | -|---|---| -|CPU Single Instance | single_instance.sh | -|CPU Multi Instances | multi_instance.sh | - -> Note: Please use the fine-tuned model for correct accuracy. Just change the `MODEL_NAME_OR_PATH` in the script before you running. By default, the `MODEL_NAME_OR_PATH` is `bert-large-uncased` which is downloaded from the Hugging Face website. - -## Running on CPU - -### Single instance - -``` -./inference/single_instance.sh -``` - -By default, it will launch 1 instance to run inference with SST-2 dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script. - -Below is the help message by using the command `./inference/single_instance.sh -h`: - -```markdown -Usage: ./inference/single_instance.sh [OPTIONS] -OPTION includes: - -l | --log_name - the log name of this round - -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET - -b | --batch_size - batch size per instance - -s | --sequence_len - max sequence length - ~~--ipex_fp32 - wether to use ipex_fp32 precision~~ - ~~--ipex_bf16 - wether to use ipex_bf16 precision~~ - ~~--int8 - wether to use int8 precision~~ - ~~--int8_bf16 - wether to use int8_bf16 precision~~ - -h | --help - displays this message -``` - - - -### Multi-instance - -``` -./inference/multi_instance.sh -``` - -By default, it will launch 2 instances (1 instance/socket) to run inference with SST-2 dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script. 
-Below is the help message by using the command `./inference/multi_instance.sh -h`
-
-```markdown
-Usage: ./inference/multi_instance.sh [OPTIONS]
-OPTION includes:
-   -l | --log_name - the log name of this round
-   -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET
-   -n | --num_of_ins_per_socket - number of instance per socket
-   -b | --batch_size - batch size per instance
-   -s | --sequence_len - max sequence length
-   ~~--ipex_fp32 - wether to use ipex_fp32 precision~~
-   ~~--ipex_bf16 - wether to use ipex_bf16 precision~~
-   ~~--int8 - wether to use int8 precision~~
-   ~~--int8_bf16 - wether to use int8_bf16 precision~~
-   -h | --help - displays this message
-```
-
diff --git a/profiling-transformers/deploy/install_torch_ccl.sh b/profiling-transformers/deploy/install_torch_ccl.sh
index 3fa177d..b28e20b 100755
--- a/profiling-transformers/deploy/install_torch_ccl.sh
+++ b/profiling-transformers/deploy/install_torch_ccl.sh
@@ -15,7 +15,6 @@ # and limitations under the License.
#
-#
GCC_GOOD=`gcc --version | awk '/gcc/ && ($3+0)>=8.3{print "1"}'`
if [ "x$GCC_GOOD" != "x1" ] ; then
diff --git a/profiling-transformers/deploy/install_tpp.sh b/profiling-transformers/deploy/install_tpp.sh
new file mode 100755
index 0000000..fa72d6b
--- /dev/null
+++ b/profiling-transformers/deploy/install_tpp.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+# Copyright (C) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+
+
+GCC_GOOD=$(gcc --version | awk '/gcc/ && ($3+0)>=8.3{print "1"}')
+if [ "x$GCC_GOOD" != "x1" ] ; then
+  echo "Requires gcc version 8.3.0 or newer"
+  exit 1
+fi
+
+pt_version=$(python -c "import torch; print(torch.__version__)" 2> /dev/null)
+if [ "x$pt_version" == "x" ] ; then
+  echo "Can't find pytorch version, need PyTorch 1.9 or higher..."
+  exit 1
+fi
+
+
+if ! test -d ./tpp-pytorch-extension ; then
+  git clone https://github.com/libxsmm/tpp-pytorch-extension.git
+fi
+cd tpp-pytorch-extension || exit 1
+git submodule update --init && python setup.py install
+
diff --git a/profiling-transformers/fine-tuning/train_native.sh b/profiling-transformers/fine-tuning/ft_ipex.sh
similarity index 55%
rename from profiling-transformers/fine-tuning/train_native.sh
rename to profiling-transformers/fine-tuning/ft_ipex.sh
index d22fd65..1c65a4f 100755
--- a/profiling-transformers/fine-tuning/train_native.sh
+++ b/profiling-transformers/fine-tuning/ft_ipex.sh
@@ -1,4 +1,6 @@
-# Copyright (C) 2022 Intel Corporation
+#!/bin/bash
+
+# Copyright (C) 2022 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -13,17 +15,14 @@ # and limitations under the License.
# -# - -export LOG_NAME=`date "+%m%d-%H%M"` -export DATASET="sst2" -export BATCH_SIZE=32 -export SEQUENCE_LEN=55 -export BF16_IPEX_FT=0 -export FP32_IPEX_FT=0 -export TRAIN_EPOCH=1 -export MODEL_NAME_OR_PATH="bert-large-uncased" -export OUTPUT_DIR="${OUTPUT_DIR:-./logs}" +LOG_NAME=$(date "+%m%d-%H%M") +DATASET="sst2" +BATCH_SIZE=32 +SEQUENCE_LEN=55 +DTYPE_FT="fp32" +TRAIN_EPOCH=1 +MODEL_NAME_OR_PATH="bert-large-uncased" +OUTPUT_DIR="${OUTPUT_DIR:-./logs}" while [ "$1" != "" ]; do @@ -48,36 +47,38 @@ do SEQUENCE_LEN="$1" echo "sequence_len is : $SEQUENCE_LEN" ;; - --bf16_ipex_ft ) - BF16_IPEX_FT=1 - echo "bf16_ipex_ft is : $BF16_IPEX_FT" + --dtype_ft ) + shift + DTYPE_FT="$1" + echo "dtype_ft is : $DTYPE_FT" ;; - --fp32_ipex_ft ) - FP32_IPEX_FT=1 - echo "fp32_ipex_ft is : $FP32_IPEX_FT" + --train_epoch ) + shift + TRAIN_EPOCH="$1" + echo "train_epoch is : $TRAIN_EPOCH" ;; -h | --help ) - echo "Usage: ./train_native.sh [OPTIONS]" + echo "Usage: $0 [OPTIONS]" echo "OPTION includes:" echo " -l | --log_name - the log name of this round" - echo " -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET" + echo " -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET" echo " -b | --batch_size - batch size per instance" echo " -s | --sequence_len - max sequence length" - echo " --bf16_ipex_ft - wether to use bf16_ipex_ft precision" - echo " --fp32_ipex_ft - wether to use fp32_ipex_ft precision" + echo " --dtype_ft - data type used for fine-tuning" + echo " --train_epoch - train epoch" echo " -h | --help - displays this message" exit ;; * ) echo "Invalid option: $1" - echo "Usage: train_native.sh [OPTIONS]" + echo "Usage: $0 [OPTIONS]" echo "OPTION includes:" echo " -l | --log_name - the log name of this round" - echo " -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET" + echo " -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET" echo " -b | --batch_size - batch size per instance" echo " -s | --sequence_len - max sequence length" - echo " --bf16_ipex_ft - wether to use bf16_ipex_ft precision" - echo " --fp32_ipex_ft - wether to use fp32_ipex_ft precision" + echo " --dtype_ft - data type used for fine-tuning" + echo " --train_epoch - train epoch" exit ;; esac @@ -85,30 +86,27 @@ do done if [ -z "$LOG_NAME" ]; then - pre=`date "+%m%d-%H%M"` + pre=$(date "+%m%d-%H%M") else pre=$LOG_NAME fi OUTPUT_DIR=$OUTPUT_DIR'/'$pre'/'$DATASET -echo $OUTPUT_DIR - -mkdir -p $OUTPUT_DIR +echo "$OUTPUT_DIR" +mkdir -p "$OUTPUT_DIR"/output_test export CUDA_VISIBLE_DEVICES="-1"; \ -python ./src/run_pt_native.py \ - --model_name_or_path $MODEL_NAME_OR_PATH \ - --dataset $DATASET \ - --bf16_ipex_ft $BF16_IPEX_FT \ - --fp32_ipex_ft $FP32_IPEX_FT \ - --output_dir $OUTPUT_DIR/output_test \ - --max_seq_len $SEQUENCE_LEN \ - --num_train_epochs $TRAIN_EPOCH \ - --do_train \ - --per_device_train_batch_size $BATCH_SIZE \ +python ./src/run_finetune.py \ + --model_name_or_path "$MODEL_NAME_OR_PATH" \ + --dataset "$DATASET" \ + --output_dir "$OUTPUT_DIR"/output_test \ + --finetune_impl ipex \ + --dtype_ft "$DTYPE_FT" \ + --do_train \ + --max_seq_len "$SEQUENCE_LEN" \ + --num_train_epochs "$TRAIN_EPOCH" \ + --per_device_train_batch_size "$BATCH_SIZE" \ --do_predict \ --per_device_eval_batch_size 8 \ - 2>&1 | tee $OUTPUT_DIR/test_$i.log - - + 2>&1 | tee "$OUTPUT_DIR"/test.log diff --git a/profiling-transformers/fine-tuning/ft_ipex_ccl.sh b/profiling-transformers/fine-tuning/ft_ipex_ccl.sh new file mode 100755 index 0000000..711b68f --- /dev/null +++ 
b/profiling-transformers/fine-tuning/ft_ipex_ccl.sh @@ -0,0 +1,96 @@ +#!/bin/bash + +# Copyright (C) 2022 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions +# and limitations under the License. +# + +DATASET="sst2" +BATCH_SIZE=32 +SEQUENCE_LEN=55 +DTYPE_FT="fp32" +TRAIN_EPOCH=1 +MODEL_NAME_OR_PATH="bert-large-uncased" + +while [ "$1" != "" ]; +do + case $1 in + -d | --dataset ) + shift + DATASET="$1" + echo "dataset is : $DATASET" + ;; + -o ) + shift + OUTPUT_DIR="$1" + echo "output_dir is : $OUTPUT_DIR" + ;; + -b | --batch_size ) + shift + BATCH_SIZE="$1" + echo "batch size per instance is : $BATCH_SIZE" + ;; + -s | --sequence_len ) + shift + SEQUENCE_LEN="$1" + echo "sequence_len is : $SEQUENCE_LEN" + ;; + --dtype_ft ) + shift + DTYPE_FT="$1" + echo "dtype_ft is : $DTYPE_FT" + ;; + --train_epoch ) + shift + TRAIN_EPOCH="$1" + echo "train_epoch is : $TRAIN_EPOCH" + ;; + -h | --help ) + echo "Usage: $0 [OPTIONS]" + echo "OPTION includes:" + echo " -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET" + echo " -b | --batch_size - batch size per instance" + echo " -s | --sequence_len - max sequence length" + echo " --dtype_ft - data type used for fine-tuning" + echo " --train_epoch - train epoch" + echo " -h | --help - displays this message" + exit + ;; + * ) + echo "Invalid option: $1" + echo "Usage: $0 [OPTIONS]" + echo "OPTION includes:" + echo " -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET" + echo " -b | --batch_size - batch size per instance" + echo " -s | --sequence_len - max sequence length" + echo " --dtype_ft - data type used for fine-tuning" + echo " --train_epoch - train epoch" + exit + ;; + esac + shift +done + +export CUDA_VISIBLE_DEVICES="-1"; \ +python ./src/run_finetune.py \ + --model_name_or_path "$MODEL_NAME_OR_PATH" \ + --dataset "$DATASET" \ + --output_dir "$OUTPUT_DIR"/output_test \ + --finetune_impl ipex_ccl \ + --dtype_ft "$DTYPE_FT" \ + --do_train \ + --max_seq_len "$SEQUENCE_LEN" \ + --num_train_epochs "$TRAIN_EPOCH" \ + --per_device_train_batch_size "$BATCH_SIZE" \ + --do_predict \ + --per_device_eval_batch_size 8 diff --git a/profiling-transformers/fine-tuning/train_trainer.sh b/profiling-transformers/fine-tuning/ft_tpp.sh similarity index 58% rename from profiling-transformers/fine-tuning/train_trainer.sh rename to profiling-transformers/fine-tuning/ft_tpp.sh index ece76b0..4a01006 100755 --- a/profiling-transformers/fine-tuning/train_trainer.sh +++ b/profiling-transformers/fine-tuning/ft_tpp.sh @@ -1,3 +1,5 @@ +#!/bin/bash + # Copyright (C) 2022 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,17 +15,14 @@ # and limitations under the License. 
# -# - -export LOG_NAME=`date "+%m%d-%H%M"` -export DATASET="sst2" -export BATCH_SIZE=32 -export SEQUENCE_LEN=55 -export BF16="" -export USE_IPEX="" -export TRAIN_EPOCH=1 -export MODEL_NAME_OR_PATH="bert-large-uncased" -export OUTPUT_DIR="${OUTPUT_DIR:-./logs}" +LOG_NAME=$(date "+%m%d-%H%M") +DATASET="imdb" +BATCH_SIZE=32 +SEQUENCE_LEN=512 +DTYPE_FT="fp32" +TRAIN_EPOCH=1 +MODEL_NAME_OR_PATH="bert-large-uncased" +OUTPUT_DIR="${OUTPUT_DIR:-./logs}" while [ "$1" != "" ]; do @@ -48,36 +47,38 @@ do SEQUENCE_LEN="$1" echo "sequence_len is : $SEQUENCE_LEN" ;; - --bf16 ) - BF16="--bf16" - echo "use bf16" + --dtype_ft ) + shift + DTYPE_FT="$1" + echo "dtype_ft is : $DTYPE_FT" ;; - --use_ipex ) - USE_IPEX=1 - echo "use_ipex is : $USE_IPEX" + --train_epoch ) + shift + TRAIN_EPOCH="$1" + echo "train_epoch is : $TRAIN_EPOCH" ;; -h | --help ) - echo "Usage: ./fine-tuning/train_trainer.sh [OPTIONS]" + echo "Usage: $0 [OPTIONS]" echo "OPTION includes:" echo " -l | --log_name - the log name of this round" - echo " -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET" + echo " -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET" echo " -b | --batch_size - batch size per instance" echo " -s | --sequence_len - max sequence length" - echo " --bf16 - whether using hf bf16 inference" - echo " --use_ipex - whether using ipex" + echo " --dtype_ft - data type used for fine-tuning" + echo " --train_epoch - train epoch" echo " -h | --help - displays this message" exit ;; * ) echo "Invalid option: $1" - echo "Usage: ./fine-tuning/train_trainer.sh [OPTIONS]" + echo "Usage: $0 [OPTIONS]" echo "OPTION includes:" echo " -l | --log_name - the log name of this round" - echo " -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET" + echo " -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET" echo " -b | --batch_size - batch size per instance" echo " -s | --sequence_len - max sequence length" - echo " --bf16 - whether using hf bf16 inference" - echo " --use_ipex - whether using ipex" + echo " --dtype_ft - data type used for fine-tuning" + echo " --train_epoch - train epoch" exit ;; esac @@ -85,31 +86,29 @@ do done if [ -z "$LOG_NAME" ]; then - pre=`date "+%m%d-%H%M"` + pre=$(date "+%m%d-%H%M") else pre=$LOG_NAME fi OUTPUT_DIR=$OUTPUT_DIR'/'$pre'/'$DATASET -echo $OUTPUT_DIR - -mkdir -p $OUTPUT_DIR +echo "$OUTPUT_DIR" +mkdir -p "$OUTPUT_DIR"/output_test export CUDA_VISIBLE_DEVICES="-1"; \ -python ./src/run_pt.py \ - --model_name_or_path $MODEL_NAME_OR_PATH \ - --dataset $DATASET \ - --output_dir $OUTPUT_DIR/output_test \ - --max_seq_len $SEQUENCE_LEN \ - --num_train_epochs $TRAIN_EPOCH \ - --do_train \ - --per_device_train_batch_size $BATCH_SIZE \ +python ./src/run_finetune.py \ + --model_name_or_path "$MODEL_NAME_OR_PATH" \ + --dataset "$DATASET" \ + --output_dir "$OUTPUT_DIR"/output_test \ + --finetune_impl tpp \ + --do_train \ + --dtype_ft "$DTYPE_FT" \ + --use_tpp --unpad \ + --max_seq_len "$SEQUENCE_LEN" \ + --num_train_epochs "$TRAIN_EPOCH" \ + --per_device_train_batch_size "$BATCH_SIZE" \ --do_predict \ --per_device_eval_batch_size 8 \ - --no_cuda \ - $BF16 \ - $USE_IPEX \ - 2>&1 | tee $OUTPUT_DIR/test_$i.log - - + "$@" \ + 2>&1 | tee "$OUTPUT_DIR"/test.log diff --git a/profiling-transformers/fine-tuning/ft_tpp_ccl.sh b/profiling-transformers/fine-tuning/ft_tpp_ccl.sh new file mode 100755 index 0000000..7656aa3 --- /dev/null +++ b/profiling-transformers/fine-tuning/ft_tpp_ccl.sh @@ -0,0 +1,97 @@ +#!/bin/bash + +# Copyright (C) 2022 Intel Corporation +# +# Licensed under the 
Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions +# and limitations under the License. +# + +DATASET="imdb" +BATCH_SIZE=32 +SEQUENCE_LEN=512 +DTYPE_FT="fp32" +TRAIN_EPOCH=1 +MODEL_NAME_OR_PATH="bert-large-uncased" + +while [ "$1" != "" ]; +do + case $1 in + -d | --dataset ) + shift + DATASET="$1" + echo "dataset is : $DATASET" + ;; + -o ) + shift + OUTPUT_DIR="$1" + echo "output_dir is : $OUTPUT_DIR" + ;; + -b | --batch_size ) + shift + BATCH_SIZE="$1" + echo "batch size per instance is : $BATCH_SIZE" + ;; + -s | --sequence_len ) + shift + SEQUENCE_LEN="$1" + echo "sequence_len is : $SEQUENCE_LEN" + ;; + --dtype_ft ) + shift + DTYPE_FT="$1" + echo "dtype_ft is : $DTYPE_FT" + ;; + --train_epoch ) + shift + TRAIN_EPOCH="$1" + echo "train_epoch is : $TRAIN_EPOCH" + ;; + -h | --help ) + echo "Usage: $0 [OPTIONS]" + echo "OPTION includes:" + echo " -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET" + echo " -b | --batch_size - batch size per instance" + echo " -s | --sequence_len - max sequence length" + echo " --dtype_ft - data type used for fine-tuning" + echo " --train_epoch - train epoch" + echo " -h | --help - displays this message" + exit + ;; + * ) + echo "Invalid option: $1" + echo "Usage: $0 [OPTIONS]" + echo "OPTION includes:" + echo " -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET" + echo " -b | --batch_size - batch size per instance" + echo " -s | --sequence_len - max sequence length" + echo " --dtype_ft - data type used for fine-tuning" + echo " --train_epoch - train epoch" + exit + ;; + esac + shift +done + +export CUDA_VISIBLE_DEVICES="-1"; \ +python ./src/run_finetune.py \ + --model_name_or_path "$MODEL_NAME_OR_PATH" \ + --dataset "$DATASET" \ + --output_dir "$OUTPUT_DIR"/output_test \ + --finetune_impl tpp_ccl \ + --do_train \ + --dtype_ft "$DTYPE_FT" \ + --max_seq_len "$SEQUENCE_LEN" \ + --num_train_epochs "$TRAIN_EPOCH" \ + --per_device_train_batch_size "$BATCH_SIZE" \ + --do_predict \ + --per_device_eval_batch_size 256 \ + --use_tpp --unpad diff --git a/profiling-transformers/fine-tuning/ft_trainer.sh b/profiling-transformers/fine-tuning/ft_trainer.sh new file mode 100755 index 0000000..c2f12ff --- /dev/null +++ b/profiling-transformers/fine-tuning/ft_trainer.sh @@ -0,0 +1,110 @@ +#!/bin/bash + +# Copyright (C) 2022 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions +# and limitations under the License. 
+#
+
+LOG_NAME=$(date "+%m%d-%H%M")
+DATASET="sst2"
+BATCH_SIZE=32
+SEQUENCE_LEN=55
+TRAIN_EPOCH=1
+MODEL_NAME_OR_PATH="bert-large-uncased"
+OUTPUT_DIR="${OUTPUT_DIR:-./logs}"
+DTYPE_FT="fp32"
+APPEND=""
+
+while [ "$1" != "" ];
+do
+    case $1 in
+        -l | --log_name )
+            shift
+            LOG_NAME="$1"
+            echo "log name is $LOG_NAME"
+            ;;
+        -d | --dataset )
+            shift
+            DATASET="$1"
+            echo "dataset is : $DATASET"
+            ;;
+        -b | --batch_size )
+            shift
+            BATCH_SIZE="$1"
+            echo "batch size per instance is : $BATCH_SIZE"
+            ;;
+        -s | --sequence_len )
+            shift
+            SEQUENCE_LEN="$1"
+            echo "sequence_len is : $SEQUENCE_LEN"
+            ;;
+        --dtype_ft )
+            shift
+            DTYPE_FT="$1"
+            echo "dtype_ft is : $DTYPE_FT"
+            ;;
+        -h | --help )
+            echo "Usage: $0 [OPTIONS]"
+            echo "OPTION includes:"
+            echo "   -l | --log_name - the log name of this round"
+            echo "   -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET"
+            echo "   -b | --batch_size - batch size per instance"
+            echo "   -s | --sequence_len - max sequence length"
+            echo "   --dtype_ft - data type used for fine-tuning"
+            echo "   -h | --help - displays this message"
+            exit
+            ;;
+        * )
+            echo "Invalid option: $1"
+            echo "Usage: $0 [OPTIONS]"
+            echo "OPTION includes:"
+            echo "   -l | --log_name - the log name of this round"
+            echo "   -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET"
+            echo "   -b | --batch_size - batch size per instance"
+            echo "   -s | --sequence_len - max sequence length"
+            echo "   --dtype_ft - data type used for fine-tuning"
+            exit
+            ;;
+    esac
+    shift
+done
+if [ "$DTYPE_FT" == "bf16" ]; then
+    APPEND="--bf16 --use_ipex"
+fi
+
+if [ -z "$LOG_NAME" ]; then
+    pre=$(date "+%m%d-%H%M")
+else
+    pre=$LOG_NAME
+fi
+
+OUTPUT_DIR=$OUTPUT_DIR'/'$pre'/'$DATASET
+echo "$OUTPUT_DIR"
+
+mkdir -p "$OUTPUT_DIR"/output_test
+
+export CUDA_VISIBLE_DEVICES="-1"; \
+python ./src/run_finetune.py \
+    --model_name_or_path "$MODEL_NAME_OR_PATH" \
+    --dataset "$DATASET" \
+    --output_dir "$OUTPUT_DIR"/output_test \
+    --finetune_impl trainer \
+    --max_seq_len "$SEQUENCE_LEN" \
+    --num_train_epochs "$TRAIN_EPOCH" \
+    --do_train \
+    --per_device_train_batch_size "$BATCH_SIZE" \
+    --do_predict \
+    --per_device_eval_batch_size 8 \
+    --no_cuda \
+    $APPEND \
+    2>&1 | tee "$OUTPUT_DIR"/test.log
diff --git a/profiling-transformers/fine-tuning/run_dist.sh b/profiling-transformers/fine-tuning/run_dist.sh
index 8d2e94d..ebd1ab8 100755
--- a/profiling-transformers/fine-tuning/run_dist.sh
+++ b/profiling-transformers/fine-tuning/run_dist.sh
@@ -1,5 +1,6 @@
#!/bin/bash
-# Copyright (C) 2022 Intel Corporation
+
+# Copyright (C) 2022 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -14,8 +15,8 @@ # and limitations under the License.
#
-#
+OUTPUT_DIR="${OUTPUT_DIR:-./logs}"
function print_vars {
  for VAR in ${!CCL*} ${!I_MPI*} ${!i_mpi*} ${!KMP_*} ${!OMP_*} ${!ATL_*} LD_PRELOAD ${!DLRM_*} ${!PYTORCH_*} ${!PCL_*} ${!LIBXSMM_*} ${!EMULATE_*} DATALOADER_WORKER_COUNT VIRTUAL_ENV ${!ARGS_*} $@ ; do
    if ! test -z ${!VAR} ; then
@@ -28,6 +29,10 @@ SINGLE_SOCKET_ONLY=0
while (( "$#" )); do
  case "$1" in
+    -l)
+      LOG_NAME=$2
+      shift 2
+      ;;
    -n|-np)
      ARGS_NTASKS=$2
      shift 2
      ;;
@@ -58,6 +63,17 @@ while (( "$#" )); do
  esac
done
+if [ -z "$LOG_NAME" ]; then
+  pre=$(date "+%m%d-%H%M")
+else
+  pre=$LOG_NAME
+fi
+
+OUTPUT_DIR=$OUTPUT_DIR'/'$pre'/'$DATASET
+echo "$OUTPUT_DIR"
+
+mkdir -p "$OUTPUT_DIR"/output_test
+
if !
test -z $SLURM_JOB_ID ; then PREFIX="srun -n 1 -N 1 " else @@ -195,7 +211,7 @@ echo "Running mpiexec.hydra $@" echo "Start Time: `date`" SECONDS=0 #mpiexec.hydra ${MPIEXE_ARGS} ${CMD} $@ -mpiexec.hydra $@ +mpiexec.hydra $@ -o "$OUTPUT_DIR" 2>&1 | tee "$OUTPUT_DIR"/test.log echo "End Time: `date`" duration=$SECONDS echo "Total Time: $(($duration / 60)) min and $(($duration % 60)) sec" diff --git a/profiling-transformers/fine-tuning/run_ipex_native.sh b/profiling-transformers/fine-tuning/run_ipex_native.sh deleted file mode 100755 index 4e41abf..0000000 --- a/profiling-transformers/fine-tuning/run_ipex_native.sh +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (C) 2022 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions -# and limitations under the License. -# - -# - -# export CUDA_VISIBLE_DEVICES="-1"; \ -MODEL_NAME_OR_PATH="${MODEL_NAME_OR_PATH:-bert-large-uncased}" -DATASET="${DATASET:-sst2}" -MAX_SEQ_LEN=55 -NUM_TRAIN_EPOCHS=1 -OUTPUT_DIR="${OUTPUT_DIR:-fine_tuned}" -TRAINNING_BS=32 -INFERENCE_BS=8 - #--bf16_ft \ -python src/run_pt_native_ft.py \ - --model_name_or_path $MODEL_NAME_OR_PATH \ - --dataset $DATASET \ - --num_train_epochs $NUM_TRAIN_EPOCHS \ - --max_seq_len $MAX_SEQ_LEN \ - --output_dir $OUTPUT_DIR \ - --do_train \ - --per_device_train_batch_size $TRAINNING_BS \ - --do_predict \ - --per_device_eval_batch_size $INFERENCE_BS \ - --logging_strategy epoch \ - $@ diff --git a/profiling-transformers/inference/multi_instance.sh b/profiling-transformers/inference/inf_ipex_multi.sh similarity index 52% rename from profiling-transformers/inference/multi_instance.sh rename to profiling-transformers/inference/inf_ipex_multi.sh index c5f7965..7ce54b5 100755 --- a/profiling-transformers/inference/multi_instance.sh +++ b/profiling-transformers/inference/inf_ipex_multi.sh @@ -1,5 +1,6 @@ #!/bin/bash -# Copyright (C) 2022 Intel Corporation + +# Copyright (C) 2022 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,22 +15,14 @@ # and limitations under the License. 
# -# -export KMP_SETTINGS=1 -export KMP_BLOCKTIME=1 -export OMP_MAX_ACTIVE_LEVELS=1 - -export LOG_NAME=`date "+%m%d-%H%M"` -export DATASET="sst2" -export NUMBER_OF_INSTANCE_PER_SOCKET=1 -export BATCH_SIZE=8 -export SEQUENCE_LEN=55 -export IPEX_BF16=0 -export IPEX_FP32=0 -export INT8=0 -export INT8_BF16=0 -export MODEL_NAME_OR_PATH="${MODEL_NAME_OR_PATH:-bert-large-uncased}" -export OUTPUT_DIR="${OUTPUT_DIR:-./logs}" +LOG_NAME=$(date "+%m%d-%H%M") +DATASET="sst2" +NUMBER_OF_INSTANCE_PER_SOCKET=1 +BATCH_SIZE=8 +SEQUENCE_LEN=55 +DTYPE_INF="fp32" +MODEL_NAME_OR_PATH="${MODEL_NAME_OR_PATH:-bert-large-uncased}" +OUTPUT_DIR="${OUTPUT_DIR:-./logs}" while [ "$1" != "" ]; do @@ -64,52 +57,35 @@ do SEQUENCE_LEN="$1" echo "sequence_len is : $SEQUENCE_LEN" ;; - --ipex_bf16 ) - IPEX_BF16=1 - echo "ipex_bf16 is : $IPEX_BF16" - ;; - --ipex_fp32 ) - IPEX_FP32=1 - echo "ipex_fp32 is : $IPEX_FP32" - ;; - --int8 ) - INT8=1 - echo "int8 is : $INT8" - ;; - --int8_bf16 ) - INT8_BF16=1 - echo "int8_bf16 is : $INT8_BF16" + --dtype_inf ) + shift + DTYPE_INF="$1" + echo "dtype_inf is : $DTYPE_INF" ;; -h | --help ) - echo "Usage: ./inference/multi_instance.sh [OPTIONS]" + echo "Usage: $0 [OPTIONS]" echo "OPTION includes:" echo " -l | --log_name - the log name of this round" - echo " -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET" + echo " -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET" echo " -n | --num_of_ins_per_socket - number of instance per socket" # echo " -c | --cores_per_instance - cores per instance" echo " -b | --batch_size - batch size per instance" echo " -s | --sequence_len - max sequence length" - echo " --ipex_bf16 - wether to use ipex_bf16 precision" - echo " --ipex_fp32 - wether to use ipex_fp32 precision" - echo " --int8 - wether to use int8 precision" - echo " --int8_bf16 - wether to use int8_bf16 precision" + echo " --dtype_inf - data type used for inference" echo " -h | --help - displays this message" exit ;; * ) echo "Invalid option: $1" - echo "Usage: inference/multi_instance.sh [OPTIONS]" + echo "Usage: $0 [OPTIONS]" echo "OPTION includes:" echo " -l | --log_name - the log name of this round" - echo " -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET" + echo " -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET" echo " -n | --num_of_ins_per_socket - number of instance per socket" # echo " -c | --cores_per_instance - cores per instance" echo " -b | --batch_size - batch size per instance" echo " -s | --sequence_len - max sequence length" - echo " --ipex_bf16 - wether to use ipex_bf16 precision" - echo " --ipex_fp32 - wether to use ipex_fp32 precision" - echo " --int8 - wether to use int8 precision" - echo " --int8_bf16 - wether to use int8_bf16 precision" + echo " --dtype_inf - data type used for inference" exit ;; esac @@ -117,7 +93,7 @@ do done if [ -z "$LOG_NAME" ]; then - pre=`date "+%m%d-%H%M"` + pre=$(date "+%m%d-%H%M") else pre=$LOG_NAME fi @@ -125,7 +101,7 @@ fi if [ -z "$DATASET" ]; then echo "Error: Please enter the DATASET ot use [imdb|sst2]" exit -elif [ $DATASET != "imdb" -a $DATASET != "sst2" ]; then +elif [ "$DATASET" != "imdb" ] && [ "$DATASET" != "sst2" ]; then echo "Error: The DATASET $DATASET cannot be recognized, please enter 'imdb' or 'sst2'" exit fi @@ -140,80 +116,66 @@ fi # exit #fi -if [ $IPEX_BF16 = 1 ]; then - if [ $INT8 = 1 -o $INT8_BF16 = 1 ]; then - echo "Error: Cannot set IPEX_BF16 and INT8 at the same time" - exit - fi -else - if [ $INT8 = 0 -a $INT8_BF16 = 1 ]; then - echo "Error: Cannot set INT8_BF16 
without INT8 option" - exit - fi -fi - if [ -z "$BATCH_SIZE" ]; then echo "Error: Please set the batch size per instance using -b or --BATCH_SIZE" exit fi -if [ -z $SEQUENCE_LEN ]; then - if [ $DATASET = 'imdb' ]; then +if [ -z "$SEQUENCE_LEN" ]; then + if [ "$DATASET" = 'imdb' ]; then SEQUENCE_LEN=512 - elif [ $DATASET = 'sst2' ]; then + elif [ "$DATASET" = 'sst2' ]; then SEQUENCE_LEN=55 fi echo "WARNING: SEQUENCE_LEN is not set, using default DATASET ($DATASET) sequence length $SEQUENCE_LEN" fi -all_core_number=`cat /proc/cpuinfo |grep "processor"|wc -l` -socket_number=`lscpu | grep "Socket(s)" | awk '{print $2}'` -core_number_per_socket=$(($all_core_number / $socket_number)) -instance_number=$(($NUMBER_OF_INSTANCE_PER_SOCKET * $socket_number)) +all_core_number=$(grep -c "processor" /proc/cpuinfo) +socket_number=$(lscpu | grep "Socket(s)" | awk '{print $2}') +core_number_per_socket=$((all_core_number / socket_number)) +instance_number=$((NUMBER_OF_INSTANCE_PER_SOCKET * socket_number)) -if [ $(($core_number_per_socket % $NUMBER_OF_INSTANCE_PER_SOCKET)) != 0 ]; then - echo "\`instance_numberi_per_socket($NUMBER_OF_INSTANCE_PER_SOCKET)\` cannot be divisible by \`core_number_per_socket($core_number_per_socket)\`" +if [ $((core_number_per_socket % NUMBER_OF_INSTANCE_PER_SOCKET)) != 0 ]; then + echo "\`instance_number_per_socket($NUMBER_OF_INSTANCE_PER_SOCKET)\` cannot be divisible by \`core_number_per_socket($core_number_per_socket)\`" exit else - cores_per_instance=$(($core_number_per_socket / $NUMBER_OF_INSTANCE_PER_SOCKET)) + cores_per_instance=$((core_number_per_socket / NUMBER_OF_INSTANCE_PER_SOCKET)) fi -if [ $DATASET = 'imdb' ]; then - max_test_samples=$((25000/$instance_number)) +if [ "$DATASET" = 'imdb' ]; then + max_test_samples=$((25000/instance_number)) else - max_test_samples=$((872/$instance_number)) + max_test_samples=$((872/instance_number)) fi OUTPUT_DIR=$OUTPUT_DIR'/'$pre'/'$DATASET echo "log directory is $OUTPUT_DIR" -mkdir -p $OUTPUT_DIR +mkdir -p "$OUTPUT_DIR"/output_test for i in $(seq 1 $instance_number) do export OMP_NUM_THREADS=$cores_per_instance - start_index=$(( ($i-1) * $cores_per_instance)) - end_index=$(( ($i * $cores_per_instance) -1)) - mem_bind=$(( $start_index / $core_number_per_socket)) + start_index=$(( (i-1) * cores_per_instance)) + end_index=$(( (i * cores_per_instance) -1)) + mem_bind=$(( start_index / core_number_per_socket)) echo "\`start core index\` is $start_index" echo "\`end core index \` is $end_index" echo "\`memory bind\` is $mem_bind" str="numactl -C $start_index-$end_index -m $mem_bind" - echo $str - nohup numactl -C $start_index-$end_index -m $mem_bind python ./src/run_pt_native_inf.py \ - --model_name_or_path $MODEL_NAME_OR_PATH \ - --dataset $DATASET \ - --int8 $INT8 \ - --int8_bf16 $INT8_BF16 \ - --ipex_bf16 $IPEX_BF16 \ - --ipex_fp32 $IPEX_FP32 \ + echo "$str" + nohup numactl -C $start_index-$end_index -m $mem_bind python ./src/run_infer.py \ + --model_name_or_path "$MODEL_NAME_OR_PATH" \ + --dataset "$DATASET" \ + --dtype_inf "$DTYPE_INF" \ --multi_instance \ - --output_dir $OUTPUT_DIR/output_test \ + --output_dir "$OUTPUT_DIR"/output_test \ + --infer_impl ipex \ --do_predict \ --max_seq_len $SEQUENCE_LEN \ - --instance_index $i \ + --instance_index "$i" \ --max_test_samples $max_test_samples \ - --per_device_eval_batch_size $BATCH_SIZE \ - > $OUTPUT_DIR/test_$i.log 2>&1 & + --per_device_eval_batch_size "$BATCH_SIZE" \ + > "$OUTPUT_DIR"/test_"$i".log 2>&1 & done diff --git a/profiling-transformers/inference/inf_ipex_single.sh 
b/profiling-transformers/inference/inf_ipex_single.sh new file mode 100755 index 0000000..38b3d1a --- /dev/null +++ b/profiling-transformers/inference/inf_ipex_single.sh @@ -0,0 +1,104 @@ +#!/bin/bash + +# Copyright (C) 2022 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions +# and limitations under the License. +# + +LOG_NAME=$(date "+%m%d-%H%M") +DATASET="sst2" +BATCH_SIZE=8 +SEQUENCE_LEN=55 +DTYPE_INF="fp32" +MODEL_NAME_OR_PATH="${MODEL_NAME_OR_PATH:-bert-large-uncased}" +OUTPUT_DIR="${OUTPUT_DIR:-./logs}" + +while [ "$1" != "" ]; +do + case $1 in + -l | --log_name ) + shift + LOG_NAME="$1" + echo "log name is $LOG_NAME" + ;; + -d | --dataset ) + shift + DATASET="$1" + echo "dataset is : $DATASET" + ;; + -b | --batch_size ) + shift + BATCH_SIZE="$1" + echo "batch size per instance is : $BATCH_SIZE" + ;; + -s | --sequence_len ) + shift + SEQUENCE_LEN="$1" + echo "sequence_len is : $SEQUENCE_LEN" + ;; + --dtype_inf ) + shift + DTYPE_INF="$1" + echo "dtype_inf is : $DTYPE_INF" + ;; + -h | --help ) + echo "Usage: $0 [OPTIONS]" + echo "OPTION includes:" + echo " -l | --log_name - the log name of this round" + echo " -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET" + echo " -b | --batch_size - batch size per instance" + echo " -s | --sequence_len - max sequence length" + echo " --dtype_inf - data type used for inference" + echo " -h | --help - displays this message" + exit + ;; + * ) + echo "Invalid option: $1" + echo "Usage: $0 [OPTIONS]" + echo "OPTION includes:" + echo " -l | --log_name - the log name of this round" + echo " -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET" + echo " -b | --batch_size - batch size per instance" + echo " -s | --sequence_len - max sequence length" + echo " --dtype_inf - data type used for inference" + exit + ;; + esac + shift +done + +if [ -z "$LOG_NAME" ]; then + pre=$(date "+%m%d-%H%M") +else + pre=$LOG_NAME +fi + +OUTPUT_DIR=$OUTPUT_DIR'/'$pre'/'$DATASET +echo "$OUTPUT_DIR" + +mkdir -p "$OUTPUT_DIR"/output_test + + +export CUDA_VISIBLE_DEVICES="-1"; \ +python ./src/run_infer.py \ + --model_name_or_path "$MODEL_NAME_OR_PATH" \ + --dataset "$DATASET" \ + --output_dir "$OUTPUT_DIR"/output_test \ + --infer_impl ipex \ + --dtype_inf "$DTYPE_INF" \ + --do_predict \ + --max_seq_len "$SEQUENCE_LEN" \ + --per_device_eval_batch_size "$BATCH_SIZE" \ + 2>&1 | tee "$OUTPUT_DIR"/test.log + + diff --git a/profiling-transformers/inference/cpu_multi_instance.sh b/profiling-transformers/inference/inf_trainer_multi.sh similarity index 60% rename from profiling-transformers/inference/cpu_multi_instance.sh rename to profiling-transformers/inference/inf_trainer_multi.sh index 807ed99..564f070 100755 --- a/profiling-transformers/inference/cpu_multi_instance.sh +++ b/profiling-transformers/inference/inf_trainer_multi.sh @@ -1,4 +1,6 @@ -# Copyright (C) 2022 Intel Corporation +#!/bin/bash + +# Copyright (C) 2022 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/profiling-transformers/inference/cpu_multi_instance.sh b/profiling-transformers/inference/inf_trainer_multi.sh
similarity index 60%
rename from profiling-transformers/inference/cpu_multi_instance.sh
rename to profiling-transformers/inference/inf_trainer_multi.sh
index 807ed99..564f070 100755
--- a/profiling-transformers/inference/cpu_multi_instance.sh
+++ b/profiling-transformers/inference/inf_trainer_multi.sh
@@ -1,4 +1,6 @@
-# Copyright (C) 2022 Intel Corporation
+#!/bin/bash
+
+# Copyright (C) 2022 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,21 +15,15 @@
 # and limitations under the License.
 #
-#
-
-export KMP_SETTINGS=1
-export KMP_BLOCKTIME=1
-export OMP_MAX_ACTIVE_LEVELS=1
-
-export LOG_NAME=`date "+%m%d-%H%M"`
-export DATASET="sst2"
-export NUMBER_OF_INSTANCE_PER_SOCKET=1
-export BATCH_SIZE=8
-export SEQUENCE_LEN=55
-export MODEL_NAME_OR_PATH="${MODEL_NAME_OR_PATH:-bert-large-uncased}"
-export OUTPUT_DIR="${OUTPUT_DIR:-./logs}"
-export USE_IPEX=""
-export BF16=""
+LOG_NAME=$(date "+%m%d-%H%M")
+DATASET="sst2"
+NUMBER_OF_INSTANCE_PER_SOCKET=1
+BATCH_SIZE=8
+SEQUENCE_LEN=55
+MODEL_NAME_OR_PATH="${MODEL_NAME_OR_PATH:-bert-large-uncased}"
+OUTPUT_DIR="${OUTPUT_DIR:-./logs}"
+DTYPE_INF="fp32"
+APPEND=""
 
 while [ "$1" != "" ];
@@ -63,48 +59,47 @@ do
             SEQUENCE_LEN="$1"
             echo "sequence_len is : $SEQUENCE_LEN"
             ;;
-        --use_ipex )
-            USE_IPEX="--use_ipex"
-            echo " use ipex"
-            ;;
-        --bf16 )
-            BF16="--bf16"
-            echo "using hf bf16 inference"
-            ;;
+        --dtype_inf )
+            shift
+            DTYPE_INF="$1"
+            echo "dtype_inf is : $DTYPE_INF"
+            ;;
         -h | --help )
-            echo "Usage: ./inference/cpu_multi_instance.sh [OPTIONS]"
+            echo "Usage: $0 [OPTIONS]"
             echo "OPTION includes:"
             echo "   -l | --log_name - the log name of this round"
-            echo "   -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET"
+            echo "   -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET"
             echo "   -n | --num_of_ins_per_socket - number of instance per socket"
 #            echo "   -c | --cores_per_instance - cores per instance"
             echo "   -b | --batch_size - batch size per instance"
             echo "   -s | --sequence_len - max sequence length"
-            echo "   --bf16 - whether using hf bf16 inference"
-            echo "   --use_ipex - whether using ipex"
+            echo "   --dtype_inf - data type used for inference"
             echo "   -h | --help - displays this message"
             exit
             ;;
         * )
             echo "Invalid option: $1"
-            echo "Usage: inference/cpu_multi_instance.sh [OPTIONS]"
+            echo "Usage: $0 [OPTIONS]"
             echo "OPTION includes:"
             echo "   -l | --log_name - the log name of this round"
-            echo "   -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET"
+            echo "   -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET"
             echo "   -n | --num_of_ins_per_socket - number of instance per socket"
 #            echo "   -c | --cores_per_instance - cores per instance"
             echo "   -b | --batch_size - batch size per instance"
             echo "   -s | --sequence_len - max sequence length"
-            echo "   --bf16 - whether using hf bf16 inference"
-            echo "   --use_ipex - whether using ipex"
+            echo "   --dtype_inf - data type used for inference"
             exit
             ;;
     esac
     shift
 done
 
+if [ "$DTYPE_INF" == "bf16" ]; then
+    APPEND="--bf16 --use_ipex"
+fi
+
 if [ -z "$LOG_NAME" ]; then
-    pre=`date "+%m%d-%H%M"`
+    pre=$(date "+%m%d-%H%M")
 else
     pre=$LOG_NAME
 fi
@@ -112,7 +107,7 @@ fi
 if [ -z "$DATASET" ]; then
     echo "Error: Please enter the DATASET to use [imdb|sst2]"
     exit
-elif [ $DATASET != "imdb" -a $DATASET != "sst2" ]; then
+elif [ "$DATASET" != "imdb" ] && [ "$DATASET" != "sst2" ]; then
     echo "Error: The DATASET $DATASET cannot be recognized, please enter 'imdb' or 'sst2'"
     exit
 fi
@@ -132,62 +127,61 @@ if [ -z "$BATCH_SIZE" ]; then
     exit
 fi
 
-if [ -z $SEQUENCE_LEN ]; then
-    if [ $DATASET = 'imdb' ]; then
+if [ -z "$SEQUENCE_LEN" ]; then
+    if [ "$DATASET" = 'imdb' ]; then
         SEQUENCE_LEN=512
-    elif [ $DATASET = 'sst2' ]; then
+    elif [ "$DATASET" = 'sst2' ]; then
         SEQUENCE_LEN=55
     fi
     echo "WARNING: SEQUENCE_LEN is not set, using default DATASET ($DATASET) sequence length $SEQUENCE_LEN"
 fi
 
+all_core_number=$(grep -c "processor" /proc/cpuinfo)
+socket_number=$(lscpu | grep "Socket(s)" | awk '{print $2}')
+core_number_per_socket=$((all_core_number / socket_number))
+instance_number=$((NUMBER_OF_INSTANCE_PER_SOCKET * socket_number))
 
-all_core_number=`cat /proc/cpuinfo |grep "processor"|wc -l`
-socket_number=`lscpu | grep "Socket(s)" | awk '{print $2}'`
-core_number_per_socket=$(($all_core_number / $socket_number))
-instance_number=$(($NUMBER_OF_INSTANCE_PER_SOCKET * $socket_number))
-
-if [ $(($core_number_per_socket % $NUMBER_OF_INSTANCE_PER_SOCKET)) != 0 ]; then
-    echo "\`instance_numberi_per_socket($NUMBER_OF_INSTANCE_PER_SOCKET)\` cannot be divisible by \`core_number_per_socket($core_number_per_socket)\`"
+if [ $((core_number_per_socket % NUMBER_OF_INSTANCE_PER_SOCKET)) != 0 ]; then
+    echo "\`core_number_per_socket($core_number_per_socket)\` is not divisible by \`instance_number_per_socket($NUMBER_OF_INSTANCE_PER_SOCKET)\`"
     exit
 else
-    cores_per_instance=$(($core_number_per_socket / $NUMBER_OF_INSTANCE_PER_SOCKET))
+    cores_per_instance=$((core_number_per_socket / NUMBER_OF_INSTANCE_PER_SOCKET))
 fi
 
-if [ $DATASET = 'imdb' ]; then
-    max_test_samples=$((25000/$instance_number))
+if [ "$DATASET" = 'imdb' ]; then
+    max_test_samples=$((25000/instance_number))
 else
-    max_test_samples=$((872/$instance_number))
+    max_test_samples=$((872/instance_number))
 fi
 
 OUTPUT_DIR=$OUTPUT_DIR'/'$pre'/'$DATASET
 echo "log directory is $OUTPUT_DIR"
-mkdir -p $OUTPUT_DIR
+mkdir -p "$OUTPUT_DIR"/output_test
 
 for i in $(seq 1 $instance_number)
 do
     export OMP_NUM_THREADS=$cores_per_instance
-    start_index=$(( ($i-1) * $cores_per_instance))
-    end_index=$(( ($i * $cores_per_instance) -1))
-    mem_bind=$(( $start_index / $core_number_per_socket))
+    start_index=$(( (i-1) * cores_per_instance))
+    end_index=$(( (i * cores_per_instance) -1))
+    mem_bind=$(( start_index / core_number_per_socket))
     echo "\`start core index\` is $start_index"
     echo "\`end core index\` is $end_index"
     echo "\`memory bind\` is $mem_bind"
     str="numactl -C $start_index-$end_index -m $mem_bind"
-    echo $str
-    nohup numactl -C $start_index-$end_index -m $mem_bind python ./src/run_pt.py \
-        --model_name_or_path $MODEL_NAME_OR_PATH \
-        --dataset $DATASET \
+    echo "$str"
+    nohup numactl -C $start_index-$end_index -m $mem_bind python ./src/run_infer.py \
+        --model_name_or_path "$MODEL_NAME_OR_PATH" \
+        --dataset "$DATASET" \
         --multi_instance \
-        --output_dir $OUTPUT_DIR/output_test \
+        --output_dir "$OUTPUT_DIR"/output_test \
+        --infer_impl trainer \
        --do_predict \
         --max_seq_len $SEQUENCE_LEN \
-        --instance_index $i \
+        --instance_index "$i" \
         --max_test_samples $max_test_samples \
-        --per_device_eval_batch_size $BATCH_SIZE \
+        --per_device_eval_batch_size "$BATCH_SIZE" \
         --no_cuda \
-        $USE_IPEX \
-        $BF16 \
+        $APPEND \
+        > "$OUTPUT_DIR"/test_"$i".log 2>&1 &
-        > $OUTPUT_DIR/test_$i.log 2>&1 &
 done
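A usage sketch for the renamed multi-instance trainer script (run from profiling-transformers; passing `--dtype_inf bf16` makes the script expand `$APPEND` into `--bf16 --use_ipex` on the Python command line, as shown above — note `$APPEND` must stay unquoted so the two flags word-split):

```bash
cd profiling-transformers
./inference/inf_trainer_multi.sh -d imdb -n 2 -b 8 --dtype_inf bf16
```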
diff --git a/profiling-transformers/inference/cpu_single_instance.sh b/profiling-transformers/inference/inf_trainer_single.sh
similarity index 58%
rename from profiling-transformers/inference/cpu_single_instance.sh
rename to profiling-transformers/inference/inf_trainer_single.sh
index cb61c4f..fd6435f 100755
--- a/profiling-transformers/inference/cpu_single_instance.sh
+++ b/profiling-transformers/inference/inf_trainer_single.sh
@@ -1,4 +1,6 @@
-# Copyright (C) 2022 Intel Corporation
+#!/bin/bash
+
+# Copyright (C) 2022 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,17 +15,14 @@
 # and limitations under the License.
 #
-#
-#
-# export CUDA_VISIBLE_DEVICES="-1"; \
-export LOG_NAME=`date "+%m%d-%H%M"`
-export DATASET="sst2"
-export BATCH_SIZE=8
-export SEQUENCE_LEN=55
-export MODEL_NAME_OR_PATH="${MODEL_NAME_OR_PATH:-bert-large-uncased}"
-export OUTPUT_DIR="${OUTPUT_DIR:-./logs}"
-export USE_IPEX=""
-export BF16=""
+LOG_NAME=$(date "+%m%d-%H%M")
+DATASET="sst2"
+BATCH_SIZE=8
+SEQUENCE_LEN=55
+MODEL_NAME_OR_PATH="${MODEL_NAME_OR_PATH:-bert-large-uncased}"
+OUTPUT_DIR="${OUTPUT_DIR:-./logs}"
+DTYPE_INF="fp32"
+APPEND=""
 
 while [ "$1" != "" ];
 do
@@ -48,64 +47,63 @@ do
             SEQUENCE_LEN="$1"
             echo "sequence_len is : $SEQUENCE_LEN"
             ;;
-        --use_ipex )
-            USE_IPEX="--use_ipex"
-            echo " use ipex"
-            ;;
-        --bf16 )
-            BF16="--bf16"
-            echo "using hf bf16 inference"
+        --dtype_inf )
+            shift
+            DTYPE_INF="$1"
+            echo "dtype_inf is : $DTYPE_INF"
             ;;
         -h | --help )
-            echo "Usage: ./inference/cpu_single_instance.sh [OPTIONS]"
+            echo "Usage: $0 [OPTIONS]"
             echo "OPTION includes:"
             echo "   -l | --log_name - the log name of this round"
-            echo "   -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET"
+            echo "   -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET"
             echo "   -b | --batch_size - batch size per instance"
             echo "   -s | --sequence_len - max sequence length"
-            echo "   --bf16 - whether using hf bf16 inference"
-            echo "   --use_ipex - whether using ipex"
+            echo "   --dtype_inf - data type used for inference"
             echo "   -h | --help - displays this message"
             exit
             ;;
         * )
             echo "Invalid option: $1"
-            echo "Usage: ./inference/cpu_single_instance.sh [OPTIONS]"
+            echo "Usage: $0 [OPTIONS]"
             echo "OPTION includes:"
             echo "   -l | --log_name - the log name of this round"
-            echo "   -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET"
+            echo "   -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET"
             echo "   -b | --batch_size - batch size per instance"
             echo "   -s | --sequence_len - max sequence length"
-            echo "   --bf16 - whether using hf bf16 inference"
-            echo "   --use_ipex - whether using ipex"
+            echo "   --dtype_inf - data type used for inference"
             exit
             ;;
     esac
     shift
 done
 
+if [ "$DTYPE_INF" == "bf16" ]; then
+    APPEND="--bf16 --use_ipex"
+fi
+
 if [ -z "$LOG_NAME" ]; then
-    pre=`date "+%m%d-%H%M"`
+    pre=$(date "+%m%d-%H%M")
 else
     pre=$LOG_NAME
 fi
 
 OUTPUT_DIR=$OUTPUT_DIR'/'$pre'/'$DATASET
-echo $OUTPUT_DIR
+echo "$OUTPUT_DIR"
 
-mkdir -p $OUTPUT_DIR
+mkdir -p "$OUTPUT_DIR"/output_test
 
 export CUDA_VISIBLE_DEVICES="-1"; \
-python ./src/run_pt.py \
-    --model_name_or_path $MODEL_NAME_OR_PATH \
-    --dataset $DATASET \
-    --output_dir $OUTPUT_DIR/output_test \
+python ./src/run_infer.py \
+    --model_name_or_path "$MODEL_NAME_OR_PATH" \
+    --dataset "$DATASET" \
+    --output_dir "$OUTPUT_DIR"/output_test \
+    --infer_impl trainer \
     --do_predict \
-    --max_seq_len $SEQUENCE_LEN \
-    --per_device_eval_batch_size $BATCH_SIZE \
+    --max_seq_len "$SEQUENCE_LEN" \
+    --per_device_eval_batch_size "$BATCH_SIZE" \
     --no_cuda \
-    $USE_IPEX \
-    $BF16 \
-    2>&1 | tee $OUTPUT_DIR/test_$i.log
+    $APPEND \
+    2>&1 | tee "$OUTPUT_DIR"/test.log
diff --git a/profiling-transformers/inference/single_instance.sh b/profiling-transformers/inference/single_instance.sh
deleted file mode 100755
index ea01f64..0000000
--- a/profiling-transformers/inference/single_instance.sh
+++ /dev/null
@@ -1,126 +0,0 @@
-# Copyright (C) 2022 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-
-#
-
-export LOG_NAME=`date "+%m%d-%H%M"`
-export DATASET="sst2"
-export BATCH_SIZE=8
-export SEQUENCE_LEN=55
-export IPEX_BF16=0
-export IPEX_FP32=0
-export INT8=0
-export INT8_BF16=0
-export MODEL_NAME_OR_PATH="${MODEL_NAME_OR_PATH:-bert-large-uncased}"
-export OUTPUT_DIR="${OUTPUT_DIR:-./logs}"
-
-while [ "$1" != "" ];
-do
-    case $1 in
-        -l | --log_name )
-            shift
-            LOG_NAME="$1"
-            echo "log name is $LOG_NAME"
-            ;;
-        -d | --dataset )
-            shift
-            DATASET="$1"
-            echo "dataset is : $DATASET"
-            ;;
-        -b | --batch_size )
-            shift
-            BATCH_SIZE="$1"
-            echo "batch size per instance is : $BATCH_SIZE"
-            ;;
-        -s | --sequence_len )
-            shift
-            SEQUENCE_LEN="$1"
-            echo "sequence_len is : $SEQUENCE_LEN"
-            ;;
-        --ipex_bf16 )
-            IPEX_BF16=1
-            echo "ipex_bf16 is : $IPEX_BF16"
-            ;;
-        --ipex_fp32 )
-            IPEX_FP32=1
-            echo "ipex_fp32 is : $IPEX_FP32"
-            ;;
-        --int8 )
-            INT8=1
-            echo "int8 is : $INT8"
-            ;;
-        --int8_bf16 )
-            INT8_BF16=1
-            echo "int8_bf16 is : $INT8_BF16"
-            ;;
-        -h | --help )
-            echo "Usage: ././inference/single_instance.sh [OPTIONS]"
-            echo "OPTION includes:"
-            echo "   -l | --log_name - the log name of this round"
-            echo "   -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET"
-            echo "   -b | --batch_size - batch size per instance"
-            echo "   -s | --sequence_len - max sequence length"
-            echo "   --ipex_bf16 - wether to use ipex_bf16 precision"
-            echo "   --ipex_fp32 - wether to use ipex_fp32 precision"
-            echo "   --int8 - wether to use int8 precision"
-            echo "   --int8_bf16 - wether to use int8_bf16 precision"
-            echo "   -h | --help - displays this message"
-            exit
-            ;;
-        * )
-            echo "Invalid option: $1"
-            echo "Usage: ./inference/single_instance.sh [OPTIONS]"
-            echo "OPTION includes:"
-            echo "   -l | --log_name - the log name of this round"
-            echo "   -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET"
-            echo "   -b | --batch_size - batch size per instance"
-            echo "   -s | --sequence_len - max sequence length"
-            echo "   --ipex_bf16 - wether to use ipex_bf16 precision"
-            echo "   --ipex_fp32 - wether to use ipex_fp32 precision"
-            echo "   --int8 - wether to use int8 precision"
-            echo "   --int8_bf16 - wether to use int8_bf16 precision"
-            exit
-            ;;
-    esac
-    shift
-done
-
-if [ -z "$LOG_NAME" ]; then
-    pre=`date "+%m%d-%H%M"`
-else
-    pre=$LOG_NAME
-fi
-
-OUTPUT_DIR=$OUTPUT_DIR'/'$pre'/'$DATASET
-echo $OUTPUT_DIR
-
-mkdir -p $OUTPUT_DIR
-
-
-export CUDA_VISIBLE_DEVICES="-1"; \
-python ./src/run_pt_native_inf.py \
-    --model_name_or_path $MODEL_NAME_OR_PATH \
-    --dataset $DATASET \
-    --int8 $INT8 \
-    --int8_bf16 $INT8_BF16 \
-    --ipex_bf16 $IPEX_BF16 \
-    --ipex_fp32 $IPEX_FP32 \
-    --output_dir $OUTPUT_DIR/output_test \
-    --do_predict \
-    --max_seq_len $SEQUENCE_LEN \
-    --per_device_eval_batch_size $BATCH_SIZE \
-    2>&1 | tee $OUTPUT_DIR/test_$i.log
-
-
diff --git a/profiling-transformers/install.sh b/profiling-transformers/install.sh
index 4bb6d1f..2a7e6e5 100755
--- a/profiling-transformers/install.sh
+++ b/profiling-transformers/install.sh
@@ -1,5 +1,6 @@
-#!/usr/bin/bash
-# Copyright (C) 2022 Intel Corporation
+#!/bin/bash
+
+# Copyright (C) 2022 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,7 +15,7 @@
 # and limitations under the License.
 #
-#
-conda install -y pytorch==1.12.1 torchvision torchaudio cpuonly intel-openmp gperftools ninja setuptools tqdm future cmake numpy pyyaml scikit-learn pydot -c pytorch -c intel -c conda-forge
-pip install transformers==4.21.1 datasets==2.3.2 intel_extension_for_pytorch
-bash deploy/install_torch_ccl.sh
\ No newline at end of file
+conda install -y pytorch==1.12.1 torchvision torchaudio cpuonly intel-openmp gperftools ninja setuptools tqdm future cmake numpy pyyaml scikit-learn pydot impi_rt impi-devel -c pytorch -c intel -c conda-forge
+pip install transformers==4.21.1 datasets==2.3.2 intel_extension_for_pytorch==1.12.300
+bash deploy/install_torch_ccl.sh
+bash deploy/install_tpp.sh
diff --git a/profiling-transformers/src/__init__.py b/profiling-transformers/src/__init__.py
index 356ca7b..e69de29 100644
--- a/profiling-transformers/src/__init__.py
+++ b/profiling-transformers/src/__init__.py
@@ -1,16 +0,0 @@
-# Copyright (C) 2022 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-
-#
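After running install.sh, a quick import check can confirm the pinned stack resolved correctly (a sketch; the exact version strings printed depend on the environment):

```bash
python -c "import torch, transformers, datasets, intel_extension_for_pytorch as ipex; \
print(torch.__version__, transformers.__version__, datasets.__version__, ipex.__version__)"
```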
diff --git a/profiling-transformers/src/finetune.py b/profiling-transformers/src/finetune.py
new file mode 100644
index 0000000..fd4b867
--- /dev/null
+++ b/profiling-transformers/src/finetune.py
@@ -0,0 +1,90 @@
+# Copyright (C) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+
+
+from datasets import load_dataset
+from transformers import AutoTokenizer
+
+from utils import Benchmark
+
+
+class DlsaFinetune(object):
+    def __init__(self, **kwargs):
+        self.args = kwargs['args']
+        self.training_args = kwargs['training_args']
+
+        self.max_train, self.max_test = self.args.max_train_samples, self.args.max_test_samples
+        if self.args.smoke_test:
+            self.max_train, self.max_test = 100, 100
+
+        self.bench = Benchmark()
+        self.track = self.bench.track
+
+    def e2e_finetune(self):
+        with self.track('Total Run'):
+            self._load_data()
+            self._preprocess()
+            self._load_model()
+            self._do_finetune()
+            self._do_infer()
+        self.bench.summary()
+
+    def _load_data(self):
+        with self.track('Load Data'):
+            data = load_dataset(self.args.dataset)
+            train_all = data['train']
+            test_split = 'validation' if self.args.dataset == 'sst2' else 'test'
+            len_train = len(train_all)
+            self.train_data = train_all.select(
+                range(len_train - self.max_train, len_train)) if self.max_train else train_all
+            self.test_data = data[test_split].select(range(self.max_test)) if self.max_test else data[test_split]
+            self.text_column = [c for c in self.test_data.column_names if type(self.test_data[c][0]) != int][0]
+
+    def _preprocess(self):
+        with self.track('Pre-process'):
+            with self.track('----Init tokenizer'):
+                self.tokenizer = AutoTokenizer.from_pretrained(
+                    self.args.tokenizer_name if self.args.tokenizer_name else self.args.model_name_or_path
+                )
+
+            max_seq_len = min(self.args.max_seq_len, self.tokenizer.model_max_length)
+
+            with self.track('----Tokenize + Extract Features'):
+                def preprocess(examples):
+                    return self.tokenizer(
+                        examples[self.text_column],
+                        padding='max_length',
+                        truncation=True,
+                        max_length=max_seq_len
+                    )
+
+                kwargs = dict(
+                    function=preprocess,
+                    batched=True,
+                    num_proc=self.args.preprocessing_num_workers,
+                    remove_columns=[self.text_column] + (['idx'] if self.args.dataset == 'sst2' else []),
+                    load_from_cache_file=not self.args.overwrite_cache)
+
+                self.train_data = self.train_data.map(**kwargs) if self.training_args.do_train else None
+                self.test_data = self.test_data.map(**kwargs) if self.training_args.do_predict else None
+
+    def _load_model(self):
+        raise NotImplementedError
+
+    def _do_finetune(self):
+        raise NotImplementedError
+
+    def _do_infer(self):
+        raise NotImplementedError
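The smoke_test flag in the constructor above caps both splits at 100 samples, which is handy for validating a fresh environment before a full run. A hypothetical end-to-end smoke invocation through the run_finetune.py entry point introduced later in this patch (exact TrainingArguments defaults may vary):

```bash
python ./src/run_finetune.py \
    --model_name_or_path bert-large-uncased \
    --dataset sst2 \
    --finetune_impl trainer \
    --smoke_test \
    --do_train --do_predict \
    --output_dir ./logs/smoke
```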
diff --git a/profiling-transformers/src/finetune_ipex.py b/profiling-transformers/src/finetune_ipex.py
new file mode 100644
index 0000000..7ae8a95
--- /dev/null
+++ b/profiling-transformers/src/finetune_ipex.py
@@ -0,0 +1,108 @@
+# Copyright (C) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+
+
+import intel_extension_for_pytorch as ipex
+import numpy as np
+import torch
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import (
+    AutoModelForSequenceClassification,
+    DataCollatorWithPadding
+)
+
+from finetune import DlsaFinetune
+from utils import compute_metrics, PredsLabels
+
+
+class FinetuneIpex(DlsaFinetune):
+    def _load_data(self):
+        return super()._load_data()
+
+    def _preprocess(self):
+        return super()._preprocess()
+
+    def _load_model(self):
+        if self.training_args.do_train:
+            with self.track('Load Model'):
+                if self.args.dtype_ft == "fp32":
+                    self.model = AutoModelForSequenceClassification \
+                        .from_pretrained(self.args.model_name_or_path)
+                    self.model = ipex.optimize(self.model, dtype=torch.float32, level='O1')
+
+                elif self.args.dtype_ft == "bf16":
+                    with torch.cpu.amp.autocast():
+                        self.model = AutoModelForSequenceClassification \
+                            .from_pretrained(self.args.model_name_or_path)
+                        self.model = ipex.optimize(self.model, dtype=torch.bfloat16, level='O0')
+                else:
+                    error_msg = f'Only fp32 and bf16 are supported. Your input datatype is {self.args.dtype_ft}.'
+                    raise ValueError(error_msg)
+
+    def _do_finetune(self):
+        if self.training_args.do_train:
+            with self.track('Fine-Tune'):
+                with self.track('--------Init Fine-Tuning'):
+                    batch_size = self.training_args.per_device_train_batch_size
+                    self.model.train()
+                    weight_decay = 0.0
+                    no_decay = ["bias", "LayerNorm.weight"]
+                    optimizer_grouped_parameters = [
+                        {
+                            "params": [p for n, p in self.model.named_parameters() if
+                                       not any(nd in n for nd in no_decay)],
+                            "weight_decay": weight_decay,
+                        },
+                        {
+                            "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
+                            "weight_decay": 0.0,
+                        },
+                    ]
+                    optim = torch.optim.AdamW(optimizer_grouped_parameters, lr=self.training_args.learning_rate)
+
+                with self.track('--------Training Loop'):
+                    for _ in tqdm(range(int(self.training_args.num_train_epochs)), desc='Epoch'):
+                        for batch in tqdm(DataLoader(self.train_data, batch_size=batch_size, shuffle=True,
+                                                     collate_fn=DataCollatorWithPadding(self.tokenizer)),
+                                          desc='Train Step'):
+                            optim.zero_grad()
+                            loss = self.model(**batch)[0]
+                            loss.backward()
+                            optim.step()
+
+                with self.track('--------Save Fine-Tuned Model'):
+                    torch.save(self.model.state_dict(), self.training_args.output_dir + "/pytorch_model.bin")
+
+    def _do_infer(self):
+        if self.training_args.do_predict:
+            with self.track('Inference'):
+                batch_size = self.training_args.per_device_eval_batch_size
+                all_outputs, all_labels = [], []
+
+                def prediction_step(batch, labels):
+                    all_labels.extend(labels)
+                    inputs = batch
+                    output = self.model(**inputs)
+                    all_outputs.append(output['logits'].detach().cpu())
+
+                self.model.eval()
+                with torch.no_grad():
+                    for batch in tqdm(DataLoader(self.test_data, batch_size=batch_size,
+                                                 collate_fn=DataCollatorWithPadding(self.tokenizer)), desc='Test Step'):
+                        prediction_step(batch=batch, labels=batch.pop('labels'))
+                acc = compute_metrics(PredsLabels(preds=np.concatenate(all_outputs), labels=all_labels))
+                print(f"\n*********** TEST_METRICS ***********\nAccuracy: {acc['acc']}\n")
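A sketch of driving this IPEX fine-tuning path directly, mirroring what the ft_ipex.sh wrapper does (flag names come from this patch; defaults live in the wrapper script):

```bash
python ./src/run_finetune.py \
    --model_name_or_path bert-large-uncased \
    --dataset sst2 \
    --finetune_impl ipex \
    --dtype_ft bf16 \
    --do_train --do_predict \
    --output_dir ./logs/ipex_bf16
```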
diff --git a/profiling-transformers/src/finetune_ipex_dist.py b/profiling-transformers/src/finetune_ipex_dist.py
new file mode 100644
index 0000000..d74a487
--- /dev/null
+++ b/profiling-transformers/src/finetune_ipex_dist.py
@@ -0,0 +1,138 @@
+# Copyright (C) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+
+import os
+
+import numpy as np
+import torch
+from torch.utils.data import DataLoader, RandomSampler
+from torch.utils.data.distributed import DistributedSampler
+from tqdm import tqdm
+from transformers import (
+    set_seed,
+    DataCollatorWithPadding
+)
+
+from finetune_ipex import FinetuneIpex
+from utils import compute_metrics, PredsLabels
+
+
+class FinetuneIpexDist(FinetuneIpex):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        set_seed(self.training_args.seed)
+
+        if int(os.environ.get('PMI_SIZE', '0')) > 1 and not self.args.multi_instance:
+            if self.args.dist_backend == 'ccl':
+                try:
+                    import oneccl_bindings_for_pytorch
+                except ImportError:
+                    print("CCL backend requested but import oneccl_bindings_for_pytorch failed")
+                    raise
+            elif self.args.dist_backend == 'mpi':
+                if not torch.distributed.is_mpi_available():
+                    try:
+                        import torch_mpi
+                    except ImportError:
+                        print("MPI backend requested but not available; try installing the torch_mpi module")
+                        raise
+            else:
+                raise ValueError(f"{self.args.dist_backend} backend requested but not supported")
+
+            os.environ['RANK'] = os.environ.get('PMI_RANK', '0')
+            os.environ['WORLD_SIZE'] = os.environ.get('PMI_SIZE', '1')
+            torch.distributed.init_process_group(backend=self.args.dist_backend)
+            self.training_args.local_rank = torch.distributed.get_rank()
+            if self.training_args.local_rank == 0:
+                print(
+                    f"##################Using {self.args.dist_backend.upper()} dist "
+                    f"run with {torch.distributed.get_world_size()} ranks",
+                    flush=True)
+
+    def _load_data(self):
+        return super()._load_data()
+
+    def _preprocess(self):
+        return super()._preprocess()
+
+    def _load_model(self):
+        return super()._load_model()
+
+    def _do_finetune(self):
+        if self.training_args.do_train:
+            with self.track('Fine-Tune'):
+                with self.track('--------Init Fine-Tuning'):
+                    batch_size = self.training_args.per_device_train_batch_size
+                    self.model.train()
+                    weight_decay = 0.0
+                    no_decay = ["bias", "LayerNorm.weight"]
+                    optimizer_grouped_parameters = [
+                        {
+                            "params": [p for n, p in self.model.named_parameters() if
+                                       not any(nd in n for nd in no_decay)],
+                            "weight_decay": weight_decay,
+                        },
+                        {
+                            "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
+                            "weight_decay": 0.0,
+                        },
+                    ]
+                    optim = torch.optim.AdamW(optimizer_grouped_parameters, lr=self.training_args.learning_rate)
+                    if self.training_args.local_rank != -1:
+                        self.model = torch.nn.parallel.DistributedDataParallel(self.model)
+
+                with self.track('--------Training Loop'):
+                    train_sampler = RandomSampler(
+                        self.train_data) if self.training_args.local_rank == -1 else DistributedSampler(
+                        self.train_data)
+
+                    for _ in tqdm(range(int(self.training_args.num_train_epochs)), desc='Epoch',
+                                  disable=self.training_args.local_rank not in [-1, 0]):
+                        for batch in tqdm(DataLoader(self.train_data, sampler=train_sampler, batch_size=batch_size,
+                                                     collate_fn=DataCollatorWithPadding(self.tokenizer)),
+                                          desc='Train Step', disable=self.training_args.local_rank not in [-1, 0]):
+                            optim.zero_grad()
+                            loss = self.model(**batch)[0]
+                            loss.backward()
+                            optim.step()
+
+                with self.track('--------Save Fine-Tuned Model'):
+                    torch.save(self.model.state_dict(), self.training_args.output_dir + "/pytorch_model.bin")
+
+    def _do_infer(self):
+        if self.training_args.do_predict:
+            with self.track('Inference'):
+                batch_size = self.training_args.per_device_eval_batch_size
+                all_outputs, all_labels = [], []
+
+                def prediction_step(batch, labels):
+                    all_labels.extend(labels)
+                    inputs = batch
+                    output = self.model(**inputs)
+                    all_outputs.append(output['logits'].detach().cpu())
+
+                self.model.eval()
+                with torch.no_grad():
+                    test_sampler = RandomSampler(
+                        self.test_data) if self.training_args.local_rank == -1 else DistributedSampler(
+                        self.test_data)
+
+                    for batch in tqdm(DataLoader(self.test_data, sampler=test_sampler, batch_size=batch_size,
+                                                 collate_fn=DataCollatorWithPadding(self.tokenizer)),
+                                      desc='Test Step'):
+                        prediction_step(batch=batch, labels=batch.pop('labels'))
+                acc = compute_metrics(PredsLabels(preds=np.concatenate(all_outputs), labels=all_labels))
+                print(f"\n*********** TEST_METRICS ***********\nAccuracy: {acc['acc']}\n")
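FinetuneIpexDist keys off the PMI_RANK/PMI_SIZE variables that Intel MPI exports to each rank, so a two-rank launch on one node might look like the sketch below (assumes the oneAPI MPI environment is already sourced; mpiexec.hydra ships with the impi_rt package installed above):

```bash
# with the oneAPI MPI environment loaded, launch 2 ranks on this node
mpiexec.hydra -np 2 -ppn 2 python ./src/run_finetune.py \
    --model_name_or_path bert-large-uncased \
    --dataset sst2 \
    --finetune_impl ipex_ccl \
    --dist_backend ccl \
    --do_train --do_predict \
    --output_dir ./logs/dist
```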
diff --git a/profiling-transformers/src/finetune_tpp.py b/profiling-transformers/src/finetune_tpp.py
new file mode 100644
index 0000000..0916c53
--- /dev/null
+++ b/profiling-transformers/src/finetune_tpp.py
@@ -0,0 +1,109 @@
+# Copyright (C) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+
+
+import numpy as np
+import torch
+from torch.utils.data import DataLoader
+from tpp_pytorch_extension import bert as tpp_bert
+from tqdm import tqdm
+from transformers import (
+    AutoModelForSequenceClassification,
+    DataCollatorWithPadding
+)
+
+from finetune import DlsaFinetune
+from utils import compute_metrics, PredsLabels
+
+
+class FinetuneTpp(DlsaFinetune):
+
+    def _load_data(self):
+        return super()._load_data()
+
+    def _preprocess(self):
+        return super()._preprocess()
+
+    def _load_model(self):
+        if self.training_args.do_train:
+            with self.track('Load Model'):
+                if self.args.dtype_ft == "fp32":
+                    with tpp_bert.tpp_impl(self.args.use_tpp, False, self.args.unpad):
+                        self.model = AutoModelForSequenceClassification.from_pretrained(self.args.model_name_or_path)
+
+                elif self.args.dtype_ft == "bf16":
+                    with tpp_bert.tpp_impl(self.args.use_tpp, True, self.args.unpad):
+                        self.model = AutoModelForSequenceClassification.from_pretrained(self.args.model_name_or_path)
+                else:
+                    error_msg = f'Only fp32 and bf16 are supported. Your input datatype is {self.args.dtype_ft}.'
+                    raise ValueError(error_msg)
+
+                if self.args.use_tpp:
+                    tpp_bert.block(self.model)
+
+    def _do_finetune(self):
+        if self.training_args.do_train:
+            with self.track('Fine-Tune'):
+                with self.track('--------Init Fine-Tuning'):
+                    batch_size = self.training_args.per_device_train_batch_size
+                    self.model.train()
+                    weight_decay = 0.0
+                    no_decay = ["bias", "LayerNorm.weight"]
+                    optimizer_grouped_parameters = [
+                        {
+                            "params": [p for n, p in self.model.named_parameters() if
+                                       not any(nd in n for nd in no_decay)],
+                            "weight_decay": weight_decay,
+                        },
+                        {
+                            "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
+                            "weight_decay": 0.0,
+                        },
+                    ]
+                    optim = tpp_bert.AdamW(optimizer_grouped_parameters, lr=self.training_args.learning_rate)
+
+                with self.track('--------Training Loop'):
+                    for _ in tqdm(range(int(self.training_args.num_train_epochs)), desc='Epoch'):
+                        for batch in tqdm(DataLoader(self.train_data, batch_size=batch_size, shuffle=True,
+                                                     collate_fn=DataCollatorWithPadding(self.tokenizer)),
+                                          desc='Train Step'):
+                            optim.zero_grad()
+                            loss = self.model(**batch)[0]
+                            loss.backward()
+                            optim.step()
+
+                with self.track('--------Save Fine-Tuned Model'):
+                    torch.save(self.model.state_dict(), self.training_args.output_dir + "/pytorch_model.bin")
+
+    def _do_infer(self):
+        if self.training_args.do_predict:
+            with self.track('Inference'):
+                batch_size = self.training_args.per_device_eval_batch_size
+                all_outputs, all_labels = [], []
+
+                def prediction_step(batch, labels):
+                    all_labels.extend(labels)
+                    inputs = batch
+                    output = self.model(**inputs)
+                    all_outputs.append(output['logits'].detach().cpu())
+
+                self.model.eval()
+                with torch.no_grad():
+                    for batch in tqdm(DataLoader(self.test_data, batch_size=batch_size,
+                                                 collate_fn=DataCollatorWithPadding(self.tokenizer)), desc='Test Step'):
+                        prediction_step(batch=batch, labels=batch.pop('labels'))
+                acc = compute_metrics(PredsLabels(preds=np.concatenate(all_outputs), labels=all_labels))
+                print(f"\n*********** TEST_METRICS ***********\nAccuracy: {acc['acc']}\n")
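The TPP path swaps in tpp_pytorch_extension's fused BERT kernels and its AdamW. Launching it is symmetric to the IPEX case; a sketch, assuming --use_tpp and --unpad are the boolean flags this class reads from utils.Arguments:

```bash
python ./src/run_finetune.py \
    --model_name_or_path bert-large-uncased \
    --dataset sst2 \
    --finetune_impl tpp \
    --dtype_ft bf16 \
    --use_tpp --unpad \
    --do_train --do_predict \
    --output_dir ./logs/tpp_bf16
```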
diff --git a/profiling-transformers/src/finetune_tpp_dist.py b/profiling-transformers/src/finetune_tpp_dist.py
new file mode 100644
index 0000000..0238ce8
--- /dev/null
+++ b/profiling-transformers/src/finetune_tpp_dist.py
@@ -0,0 +1,139 @@
+# Copyright (C) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+
+import os
+
+import numpy as np
+import torch
+from torch.utils.data import DataLoader, RandomSampler
+from torch.utils.data.distributed import DistributedSampler
+from tpp_pytorch_extension import bert as tpp_bert
+from tqdm import tqdm
+from transformers import (
+    DataCollatorWithPadding,
+    set_seed
+)
+
+from finetune_tpp import FinetuneTpp
+from utils import compute_metrics, PredsLabels
+
+
+class FinetuneTppDist(FinetuneTpp):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        set_seed(self.training_args.seed)
+
+        if int(os.environ.get('PMI_SIZE', '0')) > 1 and not self.args.multi_instance:
+            if self.args.dist_backend == 'ccl':
+                try:
+                    import oneccl_bindings_for_pytorch
+                except ImportError:
+                    print("CCL backend requested but import oneccl_bindings_for_pytorch failed")
+                    raise
+            elif self.args.dist_backend == 'mpi':
+                if not torch.distributed.is_mpi_available():
+                    try:
+                        import torch_mpi
+                    except ImportError:
+                        print("MPI backend requested but not available; try installing the torch_mpi module")
+                        raise
+            else:
+                raise ValueError(f"{self.args.dist_backend} backend requested but not supported")
+
+            os.environ['RANK'] = os.environ.get('PMI_RANK', '0')
+            os.environ['WORLD_SIZE'] = os.environ.get('PMI_SIZE', '1')
+            torch.distributed.init_process_group(backend=self.args.dist_backend)
+            self.training_args.local_rank = torch.distributed.get_rank()
+            if self.training_args.local_rank == 0:
+                print(
+                    f"##################Using {self.args.dist_backend.upper()} dist "
+                    f"run with {torch.distributed.get_world_size()} ranks",
+                    flush=True)
+
+    def _load_data(self):
+        return super()._load_data()
+
+    def _preprocess(self):
+        return super()._preprocess()
+
+    def _load_model(self):
+        return super()._load_model()
+
+    def _do_finetune(self):
+        if self.training_args.do_train:
+            with self.track('Fine-Tune'):
+                with self.track('--------Init Fine-Tuning'):
+                    batch_size = self.training_args.per_device_train_batch_size
+                    self.model.train()
+                    weight_decay = 0.0
+                    no_decay = ["bias", "LayerNorm.weight"]
+                    optimizer_grouped_parameters = [
+                        {
+                            "params": [p for n, p in self.model.named_parameters() if
+                                       not any(nd in n for nd in no_decay)],
+                            "weight_decay": weight_decay,
+                        },
+                        {
+                            "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
+                            "weight_decay": 0.0,
+                        },
+                    ]
+                    optim = tpp_bert.AdamW(optimizer_grouped_parameters, lr=self.training_args.learning_rate)
+                    if self.training_args.local_rank != -1:
+                        self.model = torch.nn.parallel.DistributedDataParallel(self.model)
+
+                with self.track('--------Training Loop'):
+                    train_sampler = RandomSampler(
+                        self.train_data) if self.training_args.local_rank == -1 else DistributedSampler(
+                        self.train_data)
+
+                    for _ in tqdm(range(int(self.training_args.num_train_epochs)), desc='Epoch',
+                                  disable=self.training_args.local_rank not in [-1, 0]):
+                        for batch in tqdm(DataLoader(self.train_data, sampler=train_sampler, batch_size=batch_size,
+                                                     collate_fn=DataCollatorWithPadding(self.tokenizer)),
+                                          desc='Train Step', disable=self.training_args.local_rank not in [-1, 0]):
+                            optim.zero_grad()
+                            loss = self.model(**batch)[0]
+                            loss.backward()
+                            optim.step()
+
+                with self.track('--------Save Fine-Tuned Model'):
+                    torch.save(self.model.state_dict(), self.training_args.output_dir + "/pytorch_model.bin")
+
+    def _do_infer(self):
+        if self.training_args.do_predict:
+            with self.track('Inference'):
+                batch_size = self.training_args.per_device_eval_batch_size
+                all_outputs, all_labels = [], []
+
+                def prediction_step(batch, labels):
+                    all_labels.extend(labels)
+                    inputs = batch
+                    output = self.model(**inputs)
+                    all_outputs.append(output['logits'].detach().cpu())
+
+                self.model.eval()
+                with torch.no_grad():
+                    test_sampler = RandomSampler(
+                        self.test_data) if self.training_args.local_rank == -1 else DistributedSampler(
+                        self.test_data)
+
+                    for batch in tqdm(DataLoader(self.test_data, sampler=test_sampler, batch_size=batch_size,
+                                                 collate_fn=DataCollatorWithPadding(self.tokenizer)),
+                                      desc='Test Step'):
+                        prediction_step(batch=batch, labels=batch.pop('labels'))
+                acc = compute_metrics(PredsLabels(preds=np.concatenate(all_outputs), labels=all_labels))
+                print(f"\n*********** TEST_METRICS ***********\nAccuracy: {acc['acc']}\n")
diff --git a/profiling-transformers/src/finetune_trainer.py b/profiling-transformers/src/finetune_trainer.py
new file mode 100644
index 0000000..e7f8d71
--- /dev/null
+++ b/profiling-transformers/src/finetune_trainer.py
@@ -0,0 +1,59 @@
+# Copyright (C) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+
+
+from transformers import (
+    AutoModelForSequenceClassification,
+    Trainer
+)
+
+from finetune import DlsaFinetune
+from utils import compute_metrics, save_test_metrics, save_train_metrics
+
+
+class FinetuneTrainer(DlsaFinetune):
+    def _load_data(self):
+        return super()._load_data()
+
+    def _preprocess(self):
+        return super()._preprocess()
+
+    def _load_model(self):
+        with self.track('Load Model'):
+            self.model = AutoModelForSequenceClassification.from_pretrained(self.args.model_name_or_path)
+
+            self.trainer = Trainer(
+                model=self.model,  # the instantiated HF model to be trained
+                args=self.training_args,  # training arguments, defined above
+                train_dataset=self.train_data,  # training dataset
+                compute_metrics=compute_metrics,  # evaluation metrics
+                tokenizer=self.tokenizer
+            )
+
+    def _do_finetune(self):
+        if self.training_args.do_train:
+            with self.track('Fine-Tune'):
+                train_result = self.trainer.train()
+                self.trainer.save_model()
+                save_train_metrics(train_result, self.trainer, len(self.train_data))
+
+    def _do_infer(self):
+        test_metrics = ""
+        if self.training_args.do_predict:
+            with self.track('Inference'):
+                preds, _, metrics = self.trainer.predict(self.test_data)
+                test_metrics = save_test_metrics(metrics, len(self.test_data), self.training_args.output_dir)
+        print(test_metrics)
diff --git a/profiling-transformers/src/infer.py b/profiling-transformers/src/infer.py
new file mode 100644
index 0000000..7da7e43
--- /dev/null
+++ b/profiling-transformers/src/infer.py
@@ -0,0 +1,91 @@
+# Copyright (C) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+
+
+from datasets import load_dataset
+from transformers import AutoTokenizer
+
+from utils import Benchmark
+
+
+class DlsaInference(object):
+    def __init__(self, **kwargs):
+        self.args = kwargs['args']
+        self.training_args = kwargs['training_args']
+
+        self.max_train, self.max_test = self.args.max_train_samples, self.args.max_test_samples
+        if self.args.smoke_test:
+            self.max_train, self.max_test = 100, 100
+
+        self.bench = Benchmark()
+        self.track = self.bench.track
+
+    def e2e_infer(self):
+        with self.track('Total Run'):
+            self._load_data()
+            self._preprocess()
+            self._load_model()
+            self._do_infer()
+        self.bench.summary()
+
+    def _load_data(self):
+        with self.track('Load Data'):
+            data = load_dataset(self.args.dataset)
+            test_split = 'validation' if self.args.dataset == 'sst2' else 'test'
+            if self.args.multi_instance:
+                start_index = (self.args.instance_index - 1) * self.args.max_test_samples
+                end_index = self.args.instance_index * self.args.max_test_samples
+                self.test_data = data[test_split].select(range(start_index, end_index))
+                print("start_index is ", start_index)
+                print("end_index is ", end_index)
+                print("test length is ", len(self.test_data))
+            else:
+                self.test_data = data[test_split].select(range(self.max_test)) if self.max_test else data[test_split]
+
+            self.text_column = [c for c in self.test_data.column_names if type(self.test_data[c][0]) != int][0]
+
+    def _preprocess(self):
+        with self.track('Pre-process'):
+            with self.track('----Init tokenizer'):
+                self.tokenizer = AutoTokenizer.from_pretrained(
+                    self.args.tokenizer_name if self.args.tokenizer_name else self.args.model_name_or_path
+                )
+
+            self.max_seq_len = min(self.args.max_seq_len, self.tokenizer.model_max_length)
+
+            with self.track('----Tokenize + Extract Features'):
+                def preprocess(examples):
+                    return self.tokenizer(
+                        examples[self.text_column],
+                        padding='max_length',
+                        truncation=True,
+                        max_length=self.max_seq_len
+                    )
+
+                kwargs = dict(
+                    function=preprocess,
+                    batched=True,
+                    num_proc=self.args.preprocessing_num_workers,
+                    remove_columns=[self.text_column] + (['idx'] if self.args.dataset == 'sst2' else []),
+                    load_from_cache_file=not self.args.overwrite_cache)
+
+                self.test_data = self.test_data.map(**kwargs) if self.training_args.do_predict else None
+
+    def _load_model(self):
+        raise NotImplementedError
+
+    def _do_infer(self):
+        raise NotImplementedError
diff --git a/profiling-transformers/src/infer_ipex.py b/profiling-transformers/src/infer_ipex.py
new file mode 100644
index 0000000..d00c530
--- /dev/null
+++ b/profiling-transformers/src/infer_ipex.py
@@ -0,0 +1,136 @@
+# Copyright (C) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+
+import os
+
+import intel_extension_for_pytorch as ipex
+import numpy as np
+import torch
+from torch import tensor
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import (
+    AutoModelForSequenceClassification,
+    DataCollatorWithPadding
+)
+
+from infer import DlsaInference
+from utils import compute_metrics, PredsLabels
+
+
+class IpexInfer(DlsaInference):
+    def _load_data(self):
+        return super()._load_data()
+
+    def _preprocess(self):
+        return super()._preprocess()
+
+    def _load_model(self):
+        with self.track('Load Model'):
+            self.model = AutoModelForSequenceClassification.from_pretrained(self.args.model_name_or_path)
+
+        if self.args.dtype_inf == 'fp32':
+            self.model = ipex.optimize(self.model, dtype=torch.float32, level='O1')
+
+        elif self.args.dtype_inf == 'bf16':
+            with self.track("Process bf16 model"):
+                self.model = ipex.optimize(self.model, dtype=torch.bfloat16, level='O0')
+                dummy_tensor = torch.ones((self.training_args.per_device_eval_batch_size, self.max_seq_len),
+                                          dtype=torch.long)
+                jit_inputs = (dummy_tensor, dummy_tensor, dummy_tensor)
+                with torch.cpu.amp.autocast(), torch.no_grad():
+                    self.model = torch.jit.trace(self.model, jit_inputs, strict=False)
+                self.model = torch.jit.freeze(self.model)
+                with torch.no_grad():
+                    y = self.model(dummy_tensor, dummy_tensor, dummy_tensor)
+                    y = self.model(dummy_tensor, dummy_tensor, dummy_tensor)
+
+        elif self.args.dtype_inf == 'int8':
+            with self.track("Process int8 model"):
+                # convert fp32 model to int8
+                dummy_tensor = torch.ones((self.training_args.per_device_eval_batch_size, self.max_seq_len),
+                                          dtype=torch.long)
+                jit_inputs = (dummy_tensor, dummy_tensor, dummy_tensor)
+
+                if os.path.exists(self.args.model_name_or_path + "/quantized_model.pt"):
+                    print("load int8 model-----------------------")
+                    with torch.cpu.amp.autocast():
+                        self.model = torch.jit.load(self.args.model_name_or_path + "/quantized_model.pt")
+                        self.model = torch.jit.freeze(self.model.eval())
+                else:
+                    print("load configure and convert the model")
+                    ipex.nn.utils._model_convert.replace_dropout_with_identity(self.model)
+                    from intel_extension_for_pytorch.quantization import prepare, convert
+                    from torch.ao.quantization import MinMaxObserver, PerChannelMinMaxObserver, QConfig
+                    qconfig = QConfig(
+                        activation=MinMaxObserver.with_args(qscheme=torch.per_tensor_affine, dtype=torch.quint8),
+                        weight=PerChannelMinMaxObserver.with_args(dtype=torch.qint8,
+                                                                  qscheme=torch.per_channel_symmetric))
+                    prepared_model = prepare(self.model, qconfig, example_inputs=jit_inputs, inplace=False)
+                    prepared_model.load_qconf_summary(
+                        qconf_summary=self.args.model_name_or_path + "/int8_configure.json")
+                    with torch.cpu.amp.autocast():
+                        self.model = convert(prepared_model)
+                        self.model = torch.jit.trace(self.model, jit_inputs, strict=False)
+                    self.model = torch.jit.freeze(self.model)
+
+                with torch.no_grad():
+                    y = self.model(dummy_tensor, dummy_tensor, dummy_tensor)
+                    y = self.model(dummy_tensor, dummy_tensor, dummy_tensor)
+
+        else:
+            error_msg = f'Only fp32, bf16 and int8 are supported. Your input datatype is {self.args.dtype_inf}.'
+            raise ValueError(error_msg)
+
+    def _do_infer(self):
+        if self.training_args.do_predict:
+            with self.track('Inference'):
+                batch_size = self.training_args.per_device_eval_batch_size
+                all_outputs, all_labels = [], []
+
+                device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+                def to_inputs(batch: dict) -> dict:
+                    return {k: (v if torch.is_tensor(v) else tensor(v)).to(device=device) for k, v in batch.items()}
+
+                def prediction_step(batch, labels):
+                    all_labels.extend(labels)
+                    inputs = to_inputs(batch)
+                    output = self.model(**inputs)
+                    all_outputs.append(output['logits'].detach().cpu())
+
+                self.model.eval()
+
+                with torch.no_grad():
+                    if self.args.profiler:
+                        with torch.profiler.profile(
+                                schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=2),
+                                on_trace_ready=torch.profiler.tensorboard_trace_handler(
+                                    './profiler/' + self.args.profiler_name),
+                                record_shapes=True,
+                                profile_memory=True,
+                                with_stack=True
+                        ) as prof:
+                            for batch in tqdm(DataLoader(self.test_data, batch_size=batch_size,
+                                                         collate_fn=DataCollatorWithPadding(self.tokenizer))):
+                                prediction_step(batch=batch, labels=batch.pop('labels'))
+                                prof.step()
+                    else:
+                        for batch in tqdm(DataLoader(self.test_data, batch_size=batch_size,
+                                                     collate_fn=DataCollatorWithPadding(self.tokenizer))):
+                            prediction_step(batch=batch, labels=batch.pop('labels'))
+
+                acc = compute_metrics(PredsLabels(preds=np.concatenate(all_outputs), labels=all_labels))
+                print(f"\n*********** TEST_METRICS ***********\nAccuracy: {acc['acc']}\n")
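The int8 branch above expects an already-quantized artifact next to the model weights: either quantized_model.pt or int8_configure.json inside --model_name_or_path. A hypothetical invocation once such files exist (the model directory name is illustrative):

```bash
python ./src/run_infer.py \
    --model_name_or_path ./models/bert-large-sst2-int8 \
    --dataset sst2 \
    --infer_impl ipex \
    --dtype_inf int8 \
    --do_predict \
    --output_dir ./logs/int8
```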
diff --git a/profiling-transformers/src/infer_trainer.py b/profiling-transformers/src/infer_trainer.py
new file mode 100644
index 0000000..aa3607d
--- /dev/null
+++ b/profiling-transformers/src/infer_trainer.py
@@ -0,0 +1,50 @@
+# Copyright (C) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+
+
+from transformers import (
+    AutoModelForSequenceClassification,
+    Trainer
+)
+
+from infer import DlsaInference
+from utils import compute_metrics, save_test_metrics
+
+
+class TrainerInfer(DlsaInference):
+    def _load_data(self):
+        return super()._load_data()
+
+    def _preprocess(self):
+        return super()._preprocess()
+
+    def _load_model(self):
+        with self.track('Load Model'):
+            self.model = AutoModelForSequenceClassification.from_pretrained(self.args.model_name_or_path)
+
+            self.trainer = Trainer(
+                model=self.model,  # the instantiated HF model to be trained
+                args=self.training_args,  # training arguments, defined above
+                compute_metrics=compute_metrics,  # evaluation metrics
+                tokenizer=self.tokenizer
+            )
+
+    def _do_infer(self):
+        test_metrics = ""
+        if self.training_args.do_predict:
+            with self.track('Inference'):
+                preds, _, metrics = self.trainer.predict(self.test_data)
+                test_metrics = save_test_metrics(metrics, len(self.test_data), self.training_args.output_dir)
+        print(test_metrics)
diff --git a/profiling-transformers/src/run_finetune.py b/profiling-transformers/src/run_finetune.py
new file mode 100644
index 0000000..8f0e874
--- /dev/null
+++ b/profiling-transformers/src/run_finetune.py
@@ -0,0 +1,55 @@
+# Copyright (C) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+
+
+from transformers import HfArgumentParser, TrainingArguments
+from transformers import logging as hf_logging
+
+from utils import Arguments
+
+hf_logging.set_verbosity_info()
+
+
+def main():
+    parser = HfArgumentParser((Arguments, TrainingArguments))
+    args, training_args = parser.parse_args_into_dataclasses()
+    kwargs = {'args': args, 'training_args': training_args}
+
+    if args.finetune_impl == 'trainer':
+        from finetune_trainer import FinetuneTrainer
+        finetune = FinetuneTrainer(**kwargs)
+    elif args.finetune_impl == 'ipex':
+        from finetune_ipex import FinetuneIpex
+        finetune = FinetuneIpex(**kwargs)
+    elif args.finetune_impl == 'ipex_ccl':
+        from finetune_ipex_dist import FinetuneIpexDist
+        finetune = FinetuneIpexDist(**kwargs)
+    elif args.finetune_impl == 'tpp':
+        from finetune_tpp import FinetuneTpp
+        finetune = FinetuneTpp(**kwargs)
+    elif args.finetune_impl == 'tpp_ccl':
+        from finetune_tpp_dist import FinetuneTppDist
+        finetune = FinetuneTppDist(**kwargs)
+    else:
+        error_msg = f'Only trainer, ipex, ipex_ccl, tpp and tpp_ccl implementations ' \
+                    f'are supported for the DLSA fine-tuning pipeline. ' \
+                    f'Your input is {args.finetune_impl}.'
+        raise ValueError(error_msg)
+
+    finetune.e2e_finetune()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/profiling-transformers/src/run_infer.py b/profiling-transformers/src/run_infer.py
new file mode 100644
index 0000000..7a7f0b2
--- /dev/null
+++ b/profiling-transformers/src/run_infer.py
@@ -0,0 +1,43 @@
+# Copyright (C) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+
+
+from transformers import HfArgumentParser, TrainingArguments
+from utils import Arguments
+from transformers import logging as hf_logging
+
+hf_logging.set_verbosity_info()
+
+
+def main():
+    parser = HfArgumentParser((Arguments, TrainingArguments))
+    args, training_args = parser.parse_args_into_dataclasses()
+    kwargs = {'args': args, 'training_args': training_args}
+
+    if args.infer_impl == 'trainer':
+        from infer_trainer import TrainerInfer
+        infer = TrainerInfer(**kwargs)
+    elif args.infer_impl == 'ipex':
+        from infer_ipex import IpexInfer
+        infer = IpexInfer(**kwargs)
+    else:
+        error_msg = f'Only trainer and ipex implementations are supported for the DLSA inference pipeline. ' \
+                    f'Your input is {args.infer_impl}.'
+        raise ValueError(error_msg)
+
+    infer.e2e_infer()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/profiling-transformers/src/run_pt.py b/profiling-transformers/src/run_pt.py
deleted file mode 100644
index 7e5010b..0000000
--- a/profiling-transformers/src/run_pt.py
+++ /dev/null
@@ -1,137 +0,0 @@
-# Copyright (C) 2022 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-
-#
-
-import logging
-
-from datasets import load_dataset
-from transformers import (
-    logging as hf_logging,
-    HfArgumentParser,
-    AutoTokenizer,
-    AutoModelForSequenceClassification,
-    Trainer,
-    TrainingArguments,
-)
-
-from utils import (
-    Arguments,
-    Benchmark,
-    compute_metrics,
-    save_train_metrics,
-    save_test_metrics,
-    check_dataset
-)
-
-hf_logging.set_verbosity_info()
-logger = logging.getLogger(__name__)
-
-
-def main():
-    # See all possible arguments in src/transformers/training_args.py
-    # or by passing the --help flag to this script.
-    parser = HfArgumentParser((Arguments, TrainingArguments))
-    args, training_args = parser.parse_args_into_dataclasses()
-
-    max_train, max_test = args.max_train_samples, args.max_test_samples
-    if args.smoke_test:
-        training_args.max_steps = 3
-        max_train, max_test = 10, 10
-
-    bench = Benchmark()
-    track = bench.track
-    with track('Total Run'):
-        ############################ Load Data ####################################
-        with track('Load Data'):
-            data = load_dataset(*check_dataset(args.dataset))
-            train_all = data['train']
-            test_split = 'validation' if args.dataset == 'sst2' else 'test'
-            len_train = len(train_all)
-            train_data = train_all.select(range(len_train - max_train, len_train)) if max_train else train_all
-
-            # split the Test Data for multi-instance
-            if args.multi_instance:
-                start_index = (args.instance_index - 1) * args.max_test_samples
-                end_index = args.instance_index * args.max_test_samples
-                test_data = data[test_split].select(range(start_index, end_index))
-                print("start_index is ", start_index)
-                print("end_index is ", end_index)
-                print("test length is ", len(test_data))
-            else:
-                test_data = data[test_split].select(range(max_test)) if max_test else data[test_split]
-
-            text_column = [c for c in test_data.column_names if type(test_data[c][0]) != int][0]
-
-        ############################### Pre-process ###############################
-        with track('Pre-process'):
-            with track('----Init tokenizer'):
-                tokenizer = AutoTokenizer.from_pretrained(
-                    args.tokenizer_name if args.tokenizer_name else args.model_name_or_path
-                )
-
-            max_seq_len = min(args.max_seq_len, tokenizer.model_max_length)
-
-            with track('----Tokenize + Extract Features'):
-                def preprocess(examples):
-                    return tokenizer(
-                        examples[text_column],
-                        padding='max_length',
-                        truncation=True,
-                        max_length=max_seq_len
-                    )
-
-                kwargs = dict(
-                    function=preprocess,
-                    batched=True,
-                    num_proc=args.preprocessing_num_workers,
-                    remove_columns=[text_column] + (['idx'] if args.dataset == 'sst2' else []),
-                    load_from_cache_file=not args.overwrite_cache)
-
-                train_data = train_data.map(**kwargs) if training_args.do_train else None
-                test_data = test_data.map(**kwargs) if training_args.do_predict else None
-
-        ###################### Load Model and Trainer ############################
-        with track('Load Model'):
-            model = AutoModelForSequenceClassification.from_pretrained(args.model_name_or_path)
-
-            trainer = Trainer(
-                model=model,  # the instantiated HF model to be trained
-                args=training_args,  # training arguments, defined above
-                train_dataset=train_data,  # training dataset
-                compute_metrics=compute_metrics,  # evaluation metrics
-                tokenizer=tokenizer
-            )
-
-        ############################### Fine-Tune #################################
-        if training_args.do_train:
-            with track('Fine-Tune'):
-                train_result = trainer.train()
-                trainer.save_model()
-                save_train_metrics(train_result, trainer, len(train_data))
-
-        ############################### Inference #################################
-        test_metrics = ""
-        if training_args.do_predict:
-            with track('Inference'):
-                preds, _, metrics = trainer.predict(test_data)
-                test_metrics = save_test_metrics(metrics, len(test_data), training_args.output_dir)
-
-    bench.summary()
-    print(test_metrics)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/profiling-transformers/src/run_pt_native.py b/profiling-transformers/src/run_pt_native.py
deleted file mode 100644
index 108abaf..0000000
--- a/profiling-transformers/src/run_pt_native.py
+++ /dev/null
@@ -1,249 +0,0 @@
2022 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions -# and limitations under the License. -# - -# - -from pathlib import Path -import os -import logging -from tqdm import tqdm - -import numpy as np -import torch -from torch.utils.data import DataLoader -from torch import tensor - -try: - import intel_extension_for_pytorch as ipex -finally: - pass - -import transformers -from transformers import ( - HfArgumentParser, - AutoTokenizer, - AutoModelForSequenceClassification, - TrainingArguments, -) - -from utils import ( - Arguments, - read_dataset, - to_tensor_dataset, - Benchmark, - compute_metrics, - PredsLabels -) - -transformers.logging.set_verbosity_info() - -logger = logging.getLogger(__name__) - - -def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - - parser = HfArgumentParser((Arguments, TrainingArguments)) - args, training_args = parser.parse_args_into_dataclasses() - output_dir = Path(training_args.output_dir) - os.makedirs(output_dir, exist_ok=True) - bench = Benchmark() - track = bench.track - - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - - def to_inputs(batch: dict) -> dict: - return {k: (v if torch.is_tensor(v) else tensor(v)).to(device=device) \ - for k, v in batch.items()} - - ################################# Load Data ################################# - - with track('Load Data'): - if training_args.do_train: - # Train Data - train_texts, train_labels = read_dataset(args.dataset, 'train') - max_train = args.max_train_samples if args.max_train_samples else len(train_texts) - if args.smoke_test: - training_args.max_steps = 3 - training_args.num_train_epochs = 1 - max_train = 104 - train_texts, train_labels = train_texts[:max_train], train_labels[:max_train] - - if training_args.do_predict: - max_test = 100 if args.smoke_test else (args.max_test_samples if args.max_test_samples else None) - - if not args.real_time: - # Test Data - test_texts, test_labels = read_dataset(args.dataset, 'test') - if args.multi_instance: - start_index = (args.instance_index - 1) * args.max_test_samples - end_index = args.instance_index * args.max_test_samples - test_texts, test_labels = test_texts[start_index:end_index], test_labels[start_index:end_index] - print("start_index is ", start_index) - print("end_index is ", end_index) - print("test text length is ", len(test_texts)) - print("test labels length is ", len(test_labels)) - else: - test_texts, test_labels = test_texts[:max_test], test_labels[:max_test] - - ################################# Pre-process ################################# - with track('Pre-process'): - with track('----Init tokenizer'): - # Tokenization + Feature Extraction - tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer_name if args.tokenizer_name else args.model_name_or_path - ) - max_seq_len = min(args.max_seq_len, tokenizer.model_max_length) - token_args = dict(truncation=True, padding=True, max_length=max_seq_len) - - if training_args.do_train: - with track('----Training data 
encoding'): - train_encodings = tokenizer(train_texts, **token_args) - with track('----Training tensor data convert'): - train_dataset = to_tensor_dataset('pt', train_encodings, train_labels) - - if training_args.do_predict and not args.real_time: - with track('----PyTorch test data encoding'): - test_encodings = tokenizer(test_texts, padding='max_length', max_length=max_seq_len, - truncation=True) - with track('----PyTorch test tensor data convert'): - test_dataset = to_tensor_dataset('pt', test_encodings, test_labels) - - ################################# Load Model ################################# - if training_args.do_train or not args.torchscript: - with track('Load Model'): - if args.bf16_ipex_ft: - with torch.cpu.amp.autocast(): - model = AutoModelForSequenceClassification \ - .from_pretrained(args.model_name_or_path) \ - .to(device=device) - model = ipex.optimize(model, dtype=torch.bfloat16, level='O0') - else: - model = AutoModelForSequenceClassification \ - .from_pretrained(args.model_name_or_path) \ - .to(device=device) - - if args.fp32_ipex_ft: - model = ipex.optimize(model, dtype=torch.float32, level='O1') - - with track("Process int8 model"): - if args.int8: - # convert fp32 model to int8 - ipex.nn.utils._model_convert.replace_dropout_with_identity(model) - conf = ipex.quantization.QuantConf(configure_file=args.model_name_or_path + "/configure.json") - dumpy_tensor = torch.ones((training_args.per_device_eval_batch_size, max_seq_len), dtype=torch.long) - jit_inputs = (dumpy_tensor, dumpy_tensor, dumpy_tensor) - if args.int8_bf16: - with torch.cpu.amp.autocast(): - model = ipex.quantization.convert(model, conf, jit_inputs) - else: - model = ipex.quantization.convert(model, conf, jit_inputs) - with torch.no_grad(): - y = model(dumpy_tensor, dumpy_tensor, dumpy_tensor) - y = model(dumpy_tensor, dumpy_tensor, dumpy_tensor) - - with track("Process bf16 model"): - if args.ipex_bf16: - # convert fp32 model to bf16 - with torch.cpu.amp.autocast(), torch.no_grad(): - torch.jit.load('imdb_bf16model.pt') - model = ipex.optimize(model, dtype=torch.bfloat16, level='O0') - dumpy_tensor = torch.ones((training_args.per_device_eval_batch_size, max_seq_len), dtype=torch.long) - jit_inputs = (dumpy_tensor, dumpy_tensor, dumpy_tensor) - with torch.cpu.amp.autocast(), torch.no_grad(): - model = torch.jit.trace(model, jit_inputs, strict=False) - model = torch.jit.freeze(model) - with torch.no_grad(): - y = model(dumpy_tensor, dumpy_tensor, dumpy_tensor) - y = model(dumpy_tensor, dumpy_tensor, dumpy_tensor) - - ################################ Fine-Tune ################################# - if training_args.do_train: - with track('Fine-Tune'): - with track('--------Init Fine-Tuning'): - batch_size = training_args.per_device_train_batch_size - model.train() - weight_decay = 0.0 - no_decay = ["bias", "LayerNorm.weight"] - optimizer_grouped_parameters = [ - { - "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], - "weight_decay": weight_decay, - }, - { - "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], - "weight_decay": 0.0, - }, - ] - optim = torch.optim.AdamW(optimizer_grouped_parameters, lr=training_args.learning_rate) - - with track('--------Training Loop'): - for _ in tqdm(range(int(training_args.num_train_epochs)), desc='Epoch'): - for batch in tqdm(DataLoader(train_dataset, batch_size=batch_size, shuffle=True), - desc='Train Step'): - optim.zero_grad() - loss = model(**to_inputs(batch))[0] - loss.backward() - 
optim.step() - - with track('--------Save Fine-Tuned Model'): - if args.torchscript: - with track('--------Save TorchScript model'): - model.eval() - batch = to_inputs(batch) - traced_model = torch.jit.trace(model, [batch['input_ids'], batch['attention_mask']]) - torch.jit.save(traced_model, output_dir / "traced_model.pt") - else: - torch.save(model.state_dict(), output_dir / "pytorch_model.bin") - - ############################### Inference ################################# - if training_args.do_predict: - with track('Inference'): - if args.torchscript: - with track('--------Load TorchScript model'): - model_path = output_dir if training_args.do_train else Path(args.model_name_or_path) - model = torch.jit.load(model_path / "traced_model.pt").to(device=device) - - batch_size = training_args.per_device_eval_batch_size - all_outputs, all_labels = [], [] - - def prediction_step(batch, labels): - all_labels.extend(labels) - inputs = to_inputs(batch) - output = model(inputs['input_ids'], inputs['attention_mask']) if args.torchscript \ - else model(**inputs) - all_outputs.append(output['logits'].detach().cpu()) - - model.eval() - with torch.no_grad(): - if args.real_time: - data_generator = read_dataset(args.dataset, 'test', generator=True, \ - batch_size=batch_size, max_samples=max_test) - - for texts, labels in tqdm(data_generator, desc='Test Step'): - prediction_step(batch=tokenizer(texts, **token_args), labels=labels) - - else: - for batch in tqdm(DataLoader(test_dataset, batch_size=batch_size), desc='Test Step'): - prediction_step(batch=batch, labels=batch.pop('labels')) - acc = compute_metrics(PredsLabels(preds=np.concatenate(all_outputs), labels=all_labels)) - print(f"\n*********** TEST_METRICS ***********\nAccuracy: {acc['acc']}\n") - - bench.summary() - - -if __name__ == "__main__": - main() diff --git a/profiling-transformers/src/run_pt_native_ft.py b/profiling-transformers/src/run_pt_native_ft.py deleted file mode 100644 index 1040ced..0000000 --- a/profiling-transformers/src/run_pt_native_ft.py +++ /dev/null @@ -1,286 +0,0 @@ -# Copyright (C) 2022 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions -# and limitations under the License. -# - -# - -from pathlib import Path -import os -import logging -from tqdm import tqdm - -import numpy as np -import torch -from torch.utils.data import DataLoader, RandomSampler -from torch.utils.data.distributed import DistributedSampler -from torch import tensor - -try: - import intel_extension_for_pytorch as ipex -finally: - pass - -import transformers -from transformers import ( - HfArgumentParser, - AutoTokenizer, - AutoModelForSequenceClassification, - TrainingArguments, - set_seed, -) - -from utils import ( - Arguments, - read_dataset, - to_tensor_dataset, - Benchmark, - compute_metrics, - PredsLabels -) - -transformers.logging.set_verbosity_info() - -logger = logging.getLogger(__name__) - - -def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. 
- - parser = HfArgumentParser((Arguments, TrainingArguments)) - args, training_args = parser.parse_args_into_dataclasses() - output_dir = Path(training_args.output_dir) - os.makedirs(output_dir, exist_ok=True) - bench = Benchmark() - track = bench.track - - set_seed(training_args.seed) - - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - - if int(os.environ.get('PMI_SIZE', '0')) > 1 and not args.multi_instance: - if args.dist_backend == 'ccl': - try: - import oneccl_bindings_for_pytorch - except: - print("CCL backend requested but import oneccl_bindings_for_pytorch failed") - raise - elif args.dist_backend == 'mpi': - if not torch.distributed.is_mpi_available(): - try: - import torch_mpi - except: - print("MPI backend requested but not available try installing torch_mpi module") - raise - else: - raise ValueError(f"{args.dist_backend} backend requested but not supported") - - os.environ['RANK'] = os.environ.get('PMI_RANK', '0') - os.environ['WORLD_SIZE'] = os.environ.get('PMI_SIZE', '1') - torch.distributed.init_process_group(backend=args.dist_backend) - device = torch.device("cpu") - training_args.local_rank = torch.distributed.get_rank() - if training_args.local_rank == 0: print(f"##################Using {args.dist_backend.upper()} dist run with {torch.distributed.get_world_size()} ranks", flush=True) - - def to_inputs(batch: dict) -> dict: - return {k: (v if torch.is_tensor(v) else tensor(v)).to(device=device) \ - for k, v in batch.items()} - - ################################# Load Data ################################# - - with track('Load Data'): - if training_args.do_train: - # Train Data - train_texts, train_labels = read_dataset(args.dataset, 'train') - max_train = args.max_train_samples if args.max_train_samples else len(train_texts) - if args.smoke_test: - training_args.max_steps = 3 - training_args.num_train_epochs = 1 - max_train = 104 - train_texts, train_labels = train_texts[:max_train], train_labels[:max_train] - - if training_args.do_predict: - max_test = 100 if args.smoke_test else (args.max_test_samples if args.max_test_samples else None) - - if not args.real_time: - # Test Data - test_texts, test_labels = read_dataset(args.dataset, 'test') - if args.multi_instance: - start_index = (args.instance_index - 1) * args.max_test_samples - end_index = args.instance_index * args.max_test_samples - test_texts, test_labels = test_texts[start_index:end_index], test_labels[start_index:end_index] - print("start_index is ", start_index) - print("end_index is ", end_index) - print("test text length is ", len(test_texts)) - print("test labels length is ", len(test_labels)) - else: - test_texts, test_labels = test_texts[:max_test], test_labels[:max_test] - - ################################# Pre-process ################################# - with track('Pre-process'): - with track('----Init tokenizer'): - # Tokenization + Feature Extraction - tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer_name if args.tokenizer_name else args.model_name_or_path - ) - max_seq_len = min(args.max_seq_len, tokenizer.model_max_length) - token_args = dict(truncation=True, padding=True, max_length=max_seq_len) - - if training_args.do_train: - with track('----Training data encoding'): - train_encodings = tokenizer(train_texts, **token_args) - with track('----Training tensor data convert'): - train_dataset = to_tensor_dataset('pt', train_encodings, train_labels) - - if training_args.do_predict and not args.real_time: - with track('----PyTorch test data encoding'): - test_encodings 
= tokenizer(test_texts, padding='max_length', max_length=max_seq_len, - truncation=True) - with track('----PyTorch test tensor data convert'): - test_dataset = to_tensor_dataset('pt', test_encodings, test_labels) - - ################################# Load Model ################################# - if training_args.do_train or not args.torchscript: - with track('Load Model'): - if args.bf16_ipex_ft: - with torch.cpu.amp.autocast(): - model = AutoModelForSequenceClassification \ - .from_pretrained(args.model_name_or_path) \ - .to(device=device) - model = ipex.optimize(model, dtype=torch.bfloat16, level='O0') - else: - model = AutoModelForSequenceClassification \ - .from_pretrained(args.model_name_or_path) \ - .to(device=device) - #model = AutoModelForSequenceClassification \ - # .from_pretrained(args.model_name_or_path) \ - # .to(device=device) - - with track("Process int8 model"): - if args.int8: - # convert fp32 model to int8 - ipex.nn.utils._model_convert.replace_dropout_with_identity(model) - conf = ipex.quantization.QuantConf(configure_file=args.model_name_or_path + "/configure.json") - dumpy_tensor = torch.ones((training_args.per_device_eval_batch_size, max_seq_len), dtype=torch.long) - jit_inputs = (dumpy_tensor, dumpy_tensor, dumpy_tensor) - if args.int8_bf16: - with torch.cpu.amp.autocast(): - model = ipex.quantization.convert(model, conf, jit_inputs) - else: - model = ipex.quantization.convert(model, conf, jit_inputs) - with torch.no_grad(): - y = model(dumpy_tensor, dumpy_tensor, dumpy_tensor) - y = model(dumpy_tensor, dumpy_tensor, dumpy_tensor) - - with track("Process bf16 model"): - if args.ipex_bf16: - # convert fp32 model to bf16 - with torch.cpu.amp.autocast(), torch.no_grad(): - torch.jit.load('imdb_bf16model.pt') - model = ipex.optimize(model, dtype=torch.bfloat16, level='O0') - dumpy_tensor = torch.ones((training_args.per_device_eval_batch_size, max_seq_len), dtype=torch.long) - jit_inputs = (dumpy_tensor, dumpy_tensor, dumpy_tensor) - with torch.cpu.amp.autocast(), torch.no_grad(): - model = torch.jit.trace(model, jit_inputs, strict=False) - model = torch.jit.freeze(model) - with torch.no_grad(): - y = model(dumpy_tensor, dumpy_tensor, dumpy_tensor) - y = model(dumpy_tensor, dumpy_tensor, dumpy_tensor) - - ################################ Fine-Tune ################################# - if training_args.do_train: - with track('Fine-Tune'): - with track('--------Init Fine-Tuning'): - batch_size = training_args.per_device_train_batch_size - model.train() - weight_decay = 0.0 - no_decay = ["bias", "LayerNorm.weight"] - optimizer_grouped_parameters = [ - { - "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], - "weight_decay": weight_decay, - }, - { - "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], - "weight_decay": 0.0, - }, - ] - optim = torch.optim.AdamW(optimizer_grouped_parameters, lr=training_args.learning_rate) - if training_args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model) - - with track('--------Training Loop'): - train_sampler = RandomSampler(train_dataset) if training_args.local_rank == -1 else DistributedSampler(train_dataset) - - for _ in tqdm(range(int(training_args.num_train_epochs)), desc='Epoch', disable=training_args.local_rank not in [-1, 0]): - for batch in tqdm(DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size), - desc='Train Step', disable=training_args.local_rank not in [-1, 0]): - optim.zero_grad() - loss = 
model(**to_inputs(batch))[0] - loss.backward() - optim.step() - - with track('--------Save Fine-Tuned Model'): - if training_args.local_rank in [-1, 0]: - # Take care of DDP wrapper - model_to_save = model.module if hasattr(model, "module") else model - if args.torchscript: - with track('--------Save TorchScript model'): - model.eval() - batch = to_inputs(batch) - traced_model = torch.jit.trace(model_to_save, [batch['input_ids'], batch['attention_mask']]) - torch.jit.save(traced_model, output_dir / "traced_model.pt") - else: - torch.save(model_to_save.state_dict(), output_dir / "pytorch_model.bin") - - ############################### Inference ################################# - if training_args.do_predict: - with track('Inference'): - if args.torchscript: - with track('--------Load TorchScript model'): - model_path = output_dir if training_args.do_train else Path(args.model_name_or_path) - model = torch.jit.load(model_path / "traced_model.pt").to(device=device) - - batch_size = training_args.per_device_eval_batch_size - all_outputs, all_labels = [], [] - - def prediction_step(batch, labels): - all_labels.extend(labels) - inputs = to_inputs(batch) - output = model(inputs['input_ids'], inputs['attention_mask']) if args.torchscript \ - else model(**inputs) - all_outputs.append(output['logits'].detach().cpu()) - - model.eval() - with torch.no_grad(): - if args.real_time: - data_generator = read_dataset(args.dataset, 'test', generator=True, \ - batch_size=batch_size, max_samples=max_test) - - for texts, labels in tqdm(data_generator, desc='Test Step'): - prediction_step(batch=tokenizer(texts, **token_args), labels=labels) - - else: - test_sampler = RandomSampler(test_dataset) if training_args.local_rank == -1 else DistributedSampler(test_dataset) - - for batch in tqdm(DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size), desc='Test Step'): - prediction_step(batch=batch, labels=batch.pop('labels')) - acc = compute_metrics(PredsLabels(preds=np.concatenate(all_outputs), labels=all_labels)) - print(f"\n*********** TEST_METRICS ***********\nAccuracy: {acc['acc']}\n") - - bench.summary() - - -if __name__ == "__main__": - main() diff --git a/profiling-transformers/src/run_pt_native_inf.py b/profiling-transformers/src/run_pt_native_inf.py deleted file mode 100644 index 935906f..0000000 --- a/profiling-transformers/src/run_pt_native_inf.py +++ /dev/null @@ -1,223 +0,0 @@ -# Copyright (C) 2022 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions -# and limitations under the License. 
-# - -# - -import logging -import os - -import numpy as np -import torch -from datasets import load_dataset -from torch import tensor -from torch.utils.data import DataLoader -from tqdm import tqdm - -try: - import intel_extension_for_pytorch as ipex -finally: - pass - -from transformers import ( - logging as hf_logging, - HfArgumentParser, - AutoTokenizer, - AutoModelForSequenceClassification, - TrainingArguments, - DataCollatorWithPadding -) - -from utils import ( - Arguments, - Benchmark, - compute_metrics, - PredsLabels, - check_dataset -) - -hf_logging.set_verbosity_info() -logger = logging.getLogger(__name__) - - -def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - parser = HfArgumentParser((Arguments, TrainingArguments)) - args, training_args = parser.parse_args_into_dataclasses() - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - - max_train, max_test = args.max_train_samples, args.max_test_samples - if args.smoke_test: - training_args.max_steps = 3 - max_train, max_test = 10, 10 - - bench = Benchmark() - track = bench.track - with track('Total Run'): - ############################ Load Data #################################### - with track('Load Data'): - data = load_dataset(*check_dataset(args.dataset)) - train_all = data['train'] - test_split = 'validation' if args.dataset == 'sst2' else 'test' - len_train = len(train_all) - train_data = train_all.select(range(len_train - max_train, len_train)) if max_train else train_all - - # split the Test Data for multi-instance - if args.multi_instance: - start_index = (args.instance_index - 1) * args.max_test_samples - end_index = args.instance_index * args.max_test_samples - test_data = data[test_split].select(range(start_index, end_index)) - print("start_index is ", start_index) - print("end_index is ", end_index) - print("test length is ", len(test_data)) - else: - test_data = data[test_split].select(range(max_test)) if max_test else data[test_split] - - text_column = [c for c in test_data.column_names if type(test_data[c][0]) != int][0] - - ############################### Pre-process ############################### - with track('Pre-process'): - with track('----Init tokenizer'): - tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer_name if args.tokenizer_name else args.model_name_or_path - ) - - max_seq_len = min(args.max_seq_len, tokenizer.model_max_length) - - with track('----Tokenize + Extract Features'): - def preprocess(examples): - return tokenizer( - examples[text_column], - padding='max_length', - truncation=True, - max_length=max_seq_len - ) - - kwargs = dict( - function=preprocess, - batched=True, - num_proc=args.preprocessing_num_workers, - remove_columns=[text_column] + (['idx'] if args.dataset == 'sst2' else []), - load_from_cache_file=not args.overwrite_cache) - - train_data = train_data.map(**kwargs) if training_args.do_train else None - test_data = test_data.map(**kwargs) if training_args.do_predict else None - - ###################### Load Model and Trainer ############################ - with track('Load Model'): - model = AutoModelForSequenceClassification.from_pretrained(args.model_name_or_path).to(device=device) - - with track("Process int8 model"): - if args.int8: - # convert fp32 model to int 8 - dumpy_tensor = torch.ones((training_args.per_device_eval_batch_size, max_seq_len), dtype=torch.long) - jit_inputs = (dumpy_tensor, dumpy_tensor, dumpy_tensor) - - if os.path.exists(args.model_name_or_path + 
"/quantized_model.pt"): - print("load int8 model-----------------------") - if args.int8_bf16: - with torch.cpu.amp.autocast(): - model = torch.jit.load(args.model_name_or_path + "/quantized_model.pt") - model = torch.jit.freeze(model.eval()) - else: - model = torch.jit.load(args.model_name_or_path + "/quantized_model.pt") - model = torch.jit.freeze(model.eval()) - else: - print("load configure and convert the model") - ipex.nn.utils._model_convert.replace_dropout_with_identity(model) - from intel_extension_for_pytorch.quantization import prepare, convert - from torch.ao.quantization import MinMaxObserver, PerChannelMinMaxObserver, QConfig - qconfig = QConfig(activation=MinMaxObserver.with_args(qscheme=torch.per_tensor_affine, dtype=torch.quint8), weight=PerChannelMinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_channel_symmetric)) - prepared_model = prepare(model, qconfig, example_inputs=jit_inputs, inplace=False) - prepared_model.load_qconf_summary(qconf_summary = args.model_name_or_path + "/int8_configure.json") - if args.int8_bf16: - with torch.cpu.amp.autocast(): - model = convert(prepared_model) - model = torch.jit.trace(model, jit_inputs, strict=False) - else: - model = convert(prepared_model) - model = torch.jit.trace(model, jit_inputs, strict=False) - model = torch.jit.freeze(model) - - - with torch.no_grad(): - y = model(dumpy_tensor, dumpy_tensor, dumpy_tensor) - y = model(dumpy_tensor, dumpy_tensor, dumpy_tensor) - - # model.save("quantized_model.pt") - # import sys - # sys.exit(0) - - with track("Process bf16 model"): - if args.ipex_bf16: - model = ipex.optimize(model, dtype=torch.bfloat16, level='O0') - dumpy_tensor = torch.ones((training_args.per_device_eval_batch_size, max_seq_len), dtype=torch.long) - jit_inputs = (dumpy_tensor, dumpy_tensor, dumpy_tensor) - with torch.cpu.amp.autocast(), torch.no_grad(): - model = torch.jit.trace(model, jit_inputs, strict=False) - model = torch.jit.freeze(model) - with torch.no_grad(): - y = model(dumpy_tensor, dumpy_tensor, dumpy_tensor) - y = model(dumpy_tensor, dumpy_tensor, dumpy_tensor) - - if args.ipex_fp32: - model = ipex.optimize(model, dtype=torch.float32, level='O1') - - ############################### Inference ################################# - if training_args.do_predict: - with track('Inference'): - batch_size = training_args.per_device_eval_batch_size - all_outputs, all_labels = [], [] - - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - - def to_inputs(batch: dict) -> dict: - return {k: (v if torch.is_tensor(v) else tensor(v)).to(device=device) for k, v in batch.items()} - - def prediction_step(batch, labels): - all_labels.extend(labels) - inputs = to_inputs(batch) - output = model(inputs['input_ids'], inputs['attention_mask']) if args.torchscript \ - else model(**inputs) - all_outputs.append(output['logits'].detach().cpu()) - - model.eval() - - with torch.no_grad(): - if args.profiler: - with torch.profiler.profile( - schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=2), - on_trace_ready=torch.profiler.tensorboard_trace_handler('./profiler/' + args.profiler_name), - record_shapes=True, - profile_memory=True, - with_stack=True - ) as prof: - for batch in tqdm(DataLoader(test_data, batch_size=batch_size, - collate_fn=DataCollatorWithPadding(tokenizer))): - prediction_step(batch=batch, labels=batch.pop('labels')) - prof.step() - else: - for batch in tqdm(DataLoader(test_data, batch_size=batch_size, - collate_fn=DataCollatorWithPadding(tokenizer))): - 
prediction_step(batch=batch, labels=batch.pop('labels'))
-
-            acc = compute_metrics(PredsLabels(preds=np.concatenate(all_outputs), labels=all_labels))
-            print(f"\n*********** TEST_METRICS ***********\nAccuracy: {acc['acc']}\n")
-
-    bench.summary()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/profiling-transformers/src/utils.py b/profiling-transformers/src/utils.py
index 85ff24c..35b1c8b 100644
--- a/profiling-transformers/src/utils.py
+++ b/profiling-transformers/src/utils.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2022 Intel Corporation
+# Copyright (C) 2022 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,28 +13,17 @@
 # and limitations under the License.
 #
 
-#
-
 import json
+from contextlib import contextmanager
 from dataclasses import dataclass, field
-from typing import Optional
 from pathlib import Path
-import numpy as np
 from time import perf_counter_ns
-from dataclasses import dataclass, field
+from typing import Optional
+
 import numpy as np
-from contextlib import contextmanager
-import os
 
 SEC_TO_NS_SCALE = 1000000000
 
-SPLIT_PATHS = {
-    ('imdb', 'train'): './datasets/aclImdb/train',
-    ('imdb', 'test'): './datasets/aclImdb/test',
-    ('sst2', 'train'): './datasets/sst/train.tsv',
-    ('sst2', 'test'): './datasets/sst/dev.tsv'
-}
-
 
 @dataclass
 class Benchmark:
@@ -109,7 +98,7 @@ class Arguments:
     profiler: int = field(
         default=0,
         metadata={
-            "help": "wether using pytorch profiler"
+            "help": "whether using pytorch profiler"
         },
     )
     profiler_name: str = field(
@@ -118,87 +107,60 @@ class Arguments:
         default='test',
         metadata={
            "help": "log name for pytorch profiler"
        },
    )
-    ipex: bool = field(
-        default=False,
-        metadata={
-            "help": "Use Intel® Extension for PyTorch for fine-Tuning."
-        },
-    )
-    ipex_bf16: int = field(
-        default=0,
-        metadata={
-            "help": "Auto mixed precision using bfloat16."
-        },
-    )
-    ipex_fp32: int = field(
-        default=0,
-        metadata={
-            "help": "Auto mixed precision using bfloat16."
-        },
+    preprocessing_num_workers: Optional[int] = field(
+        default=None,
+        metadata={"help": "The number of processes to use for the preprocessing."},
     )
-    bf16_ipex_ft: int = field(
-        default=False,
-        metadata={
-            "help": "Auto mixed precision using bfloat16 to fine-tuning."
-        },
+    overwrite_cache: bool = field(
+        default=True, metadata={"help": "Overwrite the cached training and evaluation sets."}
     )
-    fp32_ipex_ft: int = field(
+    use_tpp: bool = field(
         default=False,
         metadata={
-            "help": "use ipex optimization for fp32 fine-tuning."
-        },
-    )
-    int8_bf16: int = field(
-        default=0,
-        metadata={
-            "help": "Auto mixed precision using int8+bfloat16."
+            "help": "Use TPP Extension for PyTorch for fine-tuning."
         },
     )
-    multi_instance: bool = field(
+    unpad: bool = field(
         default=False,
         metadata={
-            "help": "Whether to use multi-instance mode"
-        },
-    )
-    int8: int = field(
-        default=0,
-        metadata={
-            "help": "Whether to do inference with int8 model"
+            "help": "Enable the unpad optimization when fine-tuning with TPP."
         },
     )
-    dist_backend: Optional[str] = field(
-        default="ccl", metadata={"help": "Distributed backend to use"}
+    #
+    # Arguments for test scenarios
+    #
+    infer_impl: Optional[str] = field(
+        default="trainer", metadata={
+            "help": "The implementation of the inference pipeline. Currently, the trainer and ipex implementations are supported."
+        }
     )
-    preprocessing_num_workers: Optional[int] = field(
-        default=None,
-        metadata={"help": "The number of processes to use for the preprocessing."},
+
+    finetune_impl: Optional[str] = field(
+        default="trainer", metadata={
+            "help": "The implementation of the fine-tuning pipeline. Currently, the trainer and ipex implementations are supported."
+        }
     )
-    overwrite_cache: bool = field(
-        default=True, metadata={"help": "Overwrite the cached training and evaluation sets."}
+
+    dtype_inf: Optional[str] = field(
+        default="fp32", metadata={
+            "help": "Data type for the inference pipeline. "
+                    "Supports fp32, bf16, and int8 on CPU, and fp32 and fp16 on GPU."
+        }
     )
-    real_time: bool = field(
-        default=False, metadata={"help": "Whether to pre-process the inputs in real-time."}
+    dtype_ft: Optional[str] = field(
+        default="fp32", metadata={
+            "help": "Data type for the fine-tuning pipeline. "
+                    "Supports fp32 and bf16 on CPU, and fp32, tf32, and fp16 on GPU."
+        }
     )
-    few_shot: bool = field(
+    multi_instance: bool = field(
         default=False,
         metadata={
-            "help": "Employ few-shot pattern-based MLM training on a small subset of the data."
+            "help": "Whether to use multi-instance mode"
         },
     )
-    pattern_id: bool = field(
-        default=0, metadata={"help": "Few-shot: pattern id of the pattern to use for few-shot training."}
-    )
-    label_loss: bool = field(
-        default=True, metadata={"help": "Few-shot: whether to use label loss."}
-    )
-    random_mlm: bool = field(
-        default=False, metadata={"help": "Few-shot: whether to use random MLM loss."}
-    )
-    alpha: float = field(
-        default=0.6, metadata={"help": "Few-shot: alpha value for loss computation: ."}
-    )
-    torchscript: bool = field(
-        default=False, metadata={"help": "Enable Torchscript."}
+    dist_backend: Optional[str] = field(
+        default="ccl", metadata={"help": "Distributed backend to use for fine-tuning"}
     )
@@ -213,100 +175,6 @@ def compute_metrics(p):
     return {"acc": (preds == p.label_ids).mean()}
 
 
-def check_dataset(name: str):
-    if name == 'imdb':
-        return [name]
-    elif name == 'sst2':
-        return ['glue', 'sst2']
-    else:
-        error_msg = f'Now only imdb and sst2 dataset are supported. Your dataset is {name}.'
- raise ValueError(error_msg) - - -def read_dataset(name: str, split: str = "test", generator: bool = False, - return_labels: bool = True, batch_size: int = 1, max_samples: int = None): - split_path = SPLIT_PATHS[(name, split)] - args = split_path, return_labels, batch_size, max_samples - gen = imdb_gen(*args) if name == 'imdb' else sst_gen(*args) - - if generator: - return gen - - texts, labels = [], [] - for text_batch, label_batch in gen: - texts.extend(text_batch) - if return_labels: - labels.extend(label_batch) - return (texts, labels) if return_labels else texts - - -def imdb_gen(split_path, return_label, batch_size, max_samples): - text_batch, label_batch = [], [] - for label_dir in "pos", "neg": - for i, text_file in enumerate((Path(split_path) / label_dir).iterdir()): - text_batch.append(text_file.read_text()) - if return_label: - label_batch.append(0 if label_dir == 'neg' else 1) - if len(text_batch) == batch_size: - yield (text_batch, label_batch) if return_label else text_batch - text_batch, label_batch = [], [] - if max_samples is not None and i == max_samples / 2: - break - if text_batch: - yield (text_batch, label_batch) if return_label else text_batch - text_batch, label_batch = [], [] - - -def sst_gen(split_path, return_label, batch_size, max_samples): - text_batch, label_batch = [], [] - i = 0 - with open(split_path) as f: - for line in f.readlines()[1:]: - if line: - i += 1 - text, label = line.strip().split(" \t") - text_batch.append(text) - if return_label: - label_batch.append(int(label)) - if len(text_batch) == batch_size: - yield (text_batch, label_batch) if return_label else text_batch - text_batch, label_batch = [], [] - if max_samples is not None and i == max_samples: - break - if text_batch: - yield (text_batch, label_batch) if return_label else text_batch - text_batch, label_batch = [], [] - - -def to_tensor_dataset(framework, encodings, labels=None): - if framework == 'tf': - from tensorflow.data import Dataset - - data = (dict(encodings), labels) if labels else dict(encodings) - dataset = Dataset.from_tensor_slices(data) - - if framework == 'pt': - from torch import tensor - from torch.utils.data import Dataset - - class IMDbDataset(Dataset): - def __init__(self, encodings, labels): - self.encodings = encodings - self.labels = labels - - def __getitem__(self, idx): - item = {key: tensor(val[idx]) for key, val in self.encodings.items()} - item['labels'] = tensor(self.labels[idx]) - return item - - def __len__(self): - return len(self.labels) - - dataset = IMDbDataset(encodings, labels) - - return dataset - - def save_train_metrics(train_result, trainer, max_train): # pytorch only if train_result: @@ -321,23 +189,3 @@ def save_test_metrics(metrics, max_test, output_dir): with open(Path(output_dir) / 'test_results.json', 'w') as f: json.dump(metrics, f, indent=2) return "\n\n******** TEST METRICS ********\n" + '\n'.join(f'{k}: {v}' for k, v in metrics.items()) - - -def read_imdb_split(split_dir): - texts, labels = [], [] - for label_dir in "pos", "neg": - for text_file in (Path(split_dir) / label_dir).iterdir(): - texts.append(text_file.read_text()) - labels.append(0 if label_dir == 'neg' else 1) - return texts, labels - - -def read_sst_file(sst_file): - texts, labels = [], [] - with open(sst_file) as f: - for line in f.readlines()[1:]: - if line: - text, label = line.strip().split(" \t") - texts.append(text) - labels.append(int(label)) - return texts, labels
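---

A minimal usage sketch of the reworked `Arguments` dataclass (illustrative only, not part of the patch; the entry-script name is a placeholder, since this hunk does not show the new file's path):

```python
# HfArgumentParser maps each dataclass field to a CLI flag of the same name,
# so the new pipeline options parse as, e.g.:
#
#   python <entry_script>.py --infer_impl ipex --dtype_inf bf16 \
#       --model_name_or_path bert-base-uncased --output_dir /tmp/dlsa
#
from transformers import HfArgumentParser, TrainingArguments

from utils import Arguments

parser = HfArgumentParser((Arguments, TrainingArguments))
args, training_args = parser.parse_args_into_dataclasses()

# The new fields select the pipeline implementation and precision.
assert args.infer_impl in ('trainer', 'ipex')
assert args.dtype_inf in ('fp32', 'bf16', 'int8', 'fp16')
```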