diff --git a/docs/fine-tuning/finetune_ipex.md b/docs/fine-tuning/finetune_ipex.md
new file mode 100644
index 0000000..30f35d1
--- /dev/null
+++ b/docs/fine-tuning/finetune_ipex.md
@@ -0,0 +1,60 @@
+# How to Run DLSA Fine-Tuning with IPEX (FP32, BF16)
+
+## Support Matrix
+
+| Category | Script |
+| -------------------- | -------------- |
+| IPEX Single Instance | ft_ipex.sh |
+| IPEX Multi-Instance | ft_ipex_ccl.sh |
+
+## Single Instance Fine-Tuning
+
+```
+./fine-tuning/ft_ipex.sh
+```
+
+By default, it will launch 1 instance to run fine-tuning with the SST-2 dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script.
+
+Below is the help message by using the command `./fine-tuning/ft_ipex.sh -h`:
+
+```markdown
+Usage: ./fine-tuning/ft_ipex.sh [OPTIONS]
+OPTION includes:
+   -l | --log_name - the log name of this round
+   -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET
+   -b | --batch_size - batch size per instance
+   -s | --sequence_len - max sequence length
+   --dtype_ft - data type used for fine-tuning
+   --train_epoch - train epoch
+   -h | --help - displays this message
+```
+## Multi-Instance Fine-Tuning
+
+
+### Running a single instance
+
+```
+bash fine-tuning/run_dist.sh -np 1 -ppn 1 bash fine-tuning/ft_ipex_ccl.sh
+```
+
+By default, it will launch 1 instance to run fine-tuning with the SST-2 dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script.
+
+### Running multiple instances
+
+```
+bash fine-tuning/run_dist.sh -np 2 -ppn 2 bash fine-tuning/ft_ipex_ccl.sh
+```
+
+By default, it will launch 2 instances on a single node to run fine-tuning with the SST-2 dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script.
+
+> Note:
+>
+> `np`: number of processes, i.e., the total number of processes you will run across the cluster.
+>
+> `ppn`: processes per node, i.e., how many processes you will run on each worker node.
+>
+> For example, to run on 2 nodes with 1 process per node, use `-np 2 -ppn 1`;
+>
+> to run on 4 nodes with 2 processes per node, use `-np 8 -ppn 2`.
+>
+> You can also pass `-l $log_name` after `run_dist.sh` to set the log name.
diff --git a/docs/fine-tuning/finetune_tpp.md b/docs/fine-tuning/finetune_tpp.md
new file mode 100644
index 0000000..6bbcd02
--- /dev/null
+++ b/docs/fine-tuning/finetune_tpp.md
@@ -0,0 +1,60 @@
+# How to Run DLSA Fine-Tuning with TPP (FP32, BF16)
+
+## Support Matrix
+
+| Category | Script |
+| ------------------- | -------------- |
+| TPP Single Instance | ft_tpp.sh |
+| TPP Multi-Instance | ft_tpp_ccl.sh |
+
+## Single Instance Fine-Tuning
+
+```
+./fine-tuning/ft_tpp.sh
+```
+
+By default, it will launch 1 instance to run fine-tuning with the IMDB dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script.
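+
+For example, a run that overrides the defaults might look like the following; the flags are the ones documented in the help message below, and the values here are only illustrative:
+
+```
+./fine-tuning/ft_tpp.sh -d sst2 -b 16 -s 55 --dtype_ft bf16 --train_epoch 2
+```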
+
+Below is the help message by using the command `./fine-tuning/ft_tpp.sh -h`:
+
+```markdown
+Usage: ./fine-tuning/ft_tpp.sh [OPTIONS]
+OPTION includes:
+   -l | --log_name - the log name of this round
+   -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET
+   -b | --batch_size - batch size per instance
+   -s | --sequence_len - max sequence length
+   --dtype_ft - data type used for fine-tuning
+   --train_epoch - train epoch
+   -h | --help - displays this message
+```
+## Multi-Instance Fine-Tuning
+
+
+### Running a single instance
+
+```
+bash fine-tuning/run_dist.sh -np 1 -ppn 1 bash fine-tuning/ft_tpp_ccl.sh
+```
+
+By default, it will launch 1 instance to run fine-tuning with the IMDB dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script.
+
+### Running multiple instances
+
+```
+bash fine-tuning/run_dist.sh -np 2 -ppn 2 bash fine-tuning/ft_tpp_ccl.sh
+```
+
+By default, it will launch 2 instances on a single node to run fine-tuning with the IMDB dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script.
+
+> Note:
+>
+> `np`: number of processes, i.e., the total number of processes you will run across the cluster.
+>
+> `ppn`: processes per node, i.e., how many processes you will run on each worker node.
+>
+> For example, to run on 2 nodes with 1 process per node, use `-np 2 -ppn 1`;
+>
+> to run on 4 nodes with 2 processes per node, use `-np 8 -ppn 2`.
+>
+> You can also pass `-l $log_name` after `run_dist.sh` to set the log name.
diff --git a/docs/fine-tuning/finetune_trainer.md b/docs/fine-tuning/finetune_trainer.md
new file mode 100644
index 0000000..5902a8e
--- /dev/null
+++ b/docs/fine-tuning/finetune_trainer.md
@@ -0,0 +1,22 @@
+# How to Run DLSA Single Node Fine-Tuning with HF Trainer (FP32, BF16)
+
+## Single Instance Fine-Tuning
+
+```
+./fine-tuning/ft_trainer.sh
+```
+
+By default, it will launch 1 instance to run fine-tuning with the SST-2 dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script.
+
+Below is the help message by using the command `./fine-tuning/ft_trainer.sh -h`:
+
+```markdown
+Usage: ./fine-tuning/ft_trainer.sh [OPTIONS]
+OPTION includes:
+   -l | --log_name - the log name of this round
+   -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET
+   -b | --batch_size - batch size per instance
+   -s | --sequence_len - max sequence length
+   --dtype_ft - data type used for fine-tuning
+   -h | --help - displays this message
+```
diff --git a/docs/fine-tuning/multi-nodes-ipex.md b/docs/fine-tuning/multi-nodes-ipex.md
deleted file mode 100644
index a51558e..0000000
--- a/docs/fine-tuning/multi-nodes-ipex.md
+++ /dev/null
@@ -1,50 +0,0 @@
-# How to Run DLSA Multi Instance Fine-Tuning with IPEX (FP32, BF16)
-
-## Install MPI library:
-
-Install MPI from [here]( https://anaconda.org/intel/impi_rt )
-
-MPI is included in the Intel OneAPI Toolkit. It's recommended to use the package manager to install.
-
-> Note: This step should be operated on all the work nodes
-
-## To run:
-
-```
-source /opt/intel/oneapi/mpi/latest/env/vars.sh
-cd profiling-transformers
-```
-
-> Note:
->
-> np: num process, means how many processes you will run on a cluster
->
-> ppn: process per node, means how many processes you will run on 1 worker node.
-> -> For example, if I want to run on 2 nodes, each node runs with 1 process, use the config `-np 2 -ppn 1` -> -> if I want to run on 4 nodes, each node runs with 2 processes, use the config `-np 8 -ppn 2` - -### Running single process in single node - -``` -bash fine-tuning/run_dist.sh -np 1 -ppn 1 bash fine-tuning/run_ipex_native.sh -``` - -### Running multi instances in single node - -``` -# Run 2 instances in single node -bash fine-tuning/run_dist.sh -np 2 -ppn 2 bash fine-tuning/run_ipex_native.sh -``` - -### Running with IPEX BF16 - -> Before you run BF16 fine-tuning, you need to verify whether your server supports BF16. (Only Copper Lake & Sapphire Rapids CPUs support BF16) - -add `--bf16_ipex_ft` at the end of the command: - -``` -bash fine-tuning/run_dist.sh -np 2 -ppn 2 bash fine-tuning/run_ipex_native.sh --bf16_ipex_ft 1 -``` - diff --git a/docs/fine-tuning/multi-nodes-stock-pytorch.md b/docs/fine-tuning/multi-nodes-stock-pytorch.md deleted file mode 100644 index 10e4320..0000000 --- a/docs/fine-tuning/multi-nodes-stock-pytorch.md +++ /dev/null @@ -1,44 +0,0 @@ -# How to Run DLSA Multi Node Fine-Tuning with Stock PyTorch(FP32) - -## Install MPI library: - -Install MPI from [here]( https://anaconda.org/intel/impi_rt ) - - -MPI is included in the Intel OneAPI Toolkit. It's recommended to use the package manager to install. - -> Note: This step should be operated on all the work nodes - -## To run: - -``` -source /opt/intel/oneapi/mpi/latest/env/vars.sh -cd profiling-transformers -``` - -> Note: -> -> np: num process, means how many processes you will run on a cluster -> -> ppn: process per node, means how many processes you will run on 1 worker node. -> -> For example, if I want to run on 2 nodes, each node runs with 1 process, use the config `-np 2 -ppn 1` -> -> if I want to run on 4 nodes, each node runs with 2 processes, use the config `-np 8 -ppn 2` - -### Running single process in single node - -``` -bash fine-tuning/run_dist.sh -np 1 -ppn 1 bash fine-tuning/run_ipex_native.sh -``` - -### Running multi-node fine-tuning - -> You need to create the `hostfile` which contains all nodes you want to run on and set password-free login. - -``` -bash fine-tuning/run_dist.sh -np 2 -ppn 1 -f hostfile bash fine-tuning/run_ipex_native.sh -``` - - - diff --git a/docs/fine-tuning/single-node-ipex.md b/docs/fine-tuning/single-node-ipex.md deleted file mode 100644 index 4baf9cd..0000000 --- a/docs/fine-tuning/single-node-ipex.md +++ /dev/null @@ -1,28 +0,0 @@ -# How to Run DLSA Single Node Fine-Tuning with IPEX(FP32, BF16) - -## Running on CPU - -### Single node - -``` -./fine-tuning/train_native.sh -``` - -By default, it will launch 1 instance to run fine-tuning with SST-2 dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script. 
- -Below is the help message by using the command `./fine-tuning/train_native.sh -h`: - -```markdown -Usage: ./fine-tuning/train_native.sh [OPTIONS] -OPTION includes: - -l | --log_name - the log name of this round - -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET - -b | --batch_size - batch size per instance - -s | --sequence_len - max sequence length - --bf16_ipex_ft - wether to use bf16_ipex_ft precision - --fp32_ipex_ft - wether to use fp32_ipex_ft precision - -h | --help - displays this message -``` - - - diff --git a/docs/fine-tuning/single-node-stock-pytorch.md b/docs/fine-tuning/single-node-stock-pytorch.md deleted file mode 100644 index 8b5f42f..0000000 --- a/docs/fine-tuning/single-node-stock-pytorch.md +++ /dev/null @@ -1,26 +0,0 @@ -# How to Run DLSA Single Node Fine-Tuning Pipeline with Stock PyTorch - -## Running on CPU - -### Single node - -``` -./fine-tuning/train_native.sh -``` - -By default, it will launch 1 instance to run fine-tuning with SST-2 dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script. - -Below is the help message by using the command `./fine-tuning/train_native.sh -h`: - -```markdown -Usage: ./fine-tuning/train_native.sh [OPTIONS] -OPTION includes: - -l | --log_name - the log name of this round - -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET - -b | --batch_size - batch size per instance - -s | --sequence_len - max sequence length - ~~--bf16_ipex_ft - wether to use bf16_ipex_ft precision~~ - ~~--fp32_ipex_ft - wether to use fp32_ipex_ft precision~~ - -h | --help - displays this message -``` - diff --git a/docs/fine-tuning/single-node-trainer.md b/docs/fine-tuning/single-node-trainer.md deleted file mode 100644 index 11da772..0000000 --- a/docs/fine-tuning/single-node-trainer.md +++ /dev/null @@ -1,28 +0,0 @@ -# How to Run DLSA Single Node Fine-Tuning with Trainer(FP32, BF16) - -## Running on CPU - -### Single node - -``` -./fine-tuning/train_trainer.sh -``` - -By default, it will launch 1 instance to run fine-tuning with SST-2 dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script. 
- -Below is the help message by using the command `./fine-tuning/train_native.sh -h`: - -```markdown -Usage: ./fine-tuning/train_trainer.sh [OPTIONS] -OPTION includes: - -l | --log_name - the log name of this round - -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET - -b | --batch_size - batch size per instance - -s | --sequence_len - max sequence length - --bf16 - whether using hf bf16 inference - --use_ipex - whether using ipex - -h | --help - displays this message -``` - - - diff --git a/docs/index.md b/docs/index.md index 61f8f2d..a2c0932 100644 --- a/docs/index.md +++ b/docs/index.md @@ -4,30 +4,19 @@ DLSA is Intel optimized representative End-to-end Fine-Tuning & Inference pipeli ![Image](assets/images/DLSA_workflow.PNG) -## Prerequisites -### Download the repo +## Run on bare-metal + +### Prerequisites +#### Download the repo ``` #download the repo -git clone https://github.com/intel/document-level-sentiment-analysis.git +git clone https://github.com/intel-innersource/frameworks.ai.end2end-ai-pipelines.dlsa.git cd frameworks.ai.end2end-ai-pipelines.dlsa/profiling-transformers -git checkout v1.0.0 -``` - -### Download the datasets: - -``` -mkdir datasets -cd datasets -#download and extract SST-2 dataset -wget https://dl.fbaipublicfiles.com/glue/data/SST-2.zip && unzip SST-2.zip && mv SST-2 sst -#download and extract IMDB dataset -wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz && tar -zxf aclImdb_v1.tar.gz ``` ->Note: Make sure the network connections work well for downloading the datasets. -## Deploy the test environment -### Download Miniconda and install it +### Deploy the test environment (Bare-metal) +#### Download Miniconda and install it ``` wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh @@ -36,7 +25,7 @@ sh Miniconda3-latest-Linux-x86_64.sh > Note: If you have already installed conda on your system, just skip this step. 
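+
+Before moving on, you can sanity-check the install; this assumes the installer added conda to your shell profile:
+
+```
+conda --version
+```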
-### Prepare the conda environment for DLSA
+#### Prepare the conda environment for DLSA

```
conda create -n dlsa python=3.8 --yes
conda activate dlsa
sh install.sh
```

-## Running DLSA Inference Pipeline
+### Running DLSA Inference Pipeline

-| Implementations | Model | API | Framework | Precision |
-| -------------------------------------------------------- | -------- | ----------- | -------------- | -------------- |
-| [Run with HF Transformers](inference/hf-transformers.md) | HF Model | Trainer | PyTorch + IPEX | FP32,BF16 |
-| [Run with Stock Pytorch](inference/stock-pytorch.md) | HF Mode | Non-trainer | PyTorch | FP32 |
-| [Run with IPEX](inference/ipex.md) | HF Mode | Non-trainer | PyTorch + IPEX | FP32,BF16,INT8 |
+| Implementations | Model | Instance | Framework | Precision |
+| -------------------------------------------- | -------- | ------------ | -------------- | ---------------- |
+| [HF Trainer](inference/infer_trainer.md) | HF Model | Single/Multi | PyTorch + IPEX | FP32, BF16 |
+| [IPEX](inference/infer_ipex.md) | HF Model | Single/Multi | PyTorch + IPEX | FP32, BF16, INT8 |

-## Running DLSA Fine-Tuning Pipeline
+### Running DLSA Fine-Tuning Pipeline

-### Single Node Fine-Tuning
-| Implementations | Model | Instance | API | Framework | Precision |
-| ---------------------------------- | -------- | -------- | ----------- | ----------------------- | ---------- |
-| [Run with HF Transformers + IPEX ](fine-tuning/single-node-trainer.md) | HF Model | Single | Trainer | PyTorch + IPEX | FP32, BF16 |
-| [Run with Stock Pytorch](fine-tuning/single-node-stock-pytorch.md) | HF Model | Single | Non-trainer | PyTorch | FP32 |
-| [Run with IPEX (Single Instance)](fine-tuning/single-node-ipex.md) | HF Model | Single | Non-trainer | PyTorch + IPEX | FP32,BF16 |
-| [Run with IPEX (Multi Instance)](fine-tuning/multi-nodes-ipex.md) | HF Model | Multiple | Non-trainer | PyTorch + IPEX | FP32,BF16 |
+| Implementations | Model | Instance | Framework | Precision |
+| --------------------------------------------- | --------- | ------------ | --------------- | ---------- |
+| [HF Trainer](fine-tuning/finetune_trainer.md) | HF Model | Single | PyTorch + IPEX | FP32, BF16 |
+| [IPEX](fine-tuning/finetune_ipex.md) | HF Model | Single/Multi | PyTorch + IPEX | FP32, BF16 |
+| [TPP](fine-tuning/finetune_tpp.md) | HF Model | Single/Multi | PyTorch + TPP | FP32, BF16 |
+
+
+
+## Run on Docker
+
+Please follow the directions from [intel ai-workflows](https://github.com/intel/ai-workflows/tree/main/language_modeling/pytorch/bert_large/training) to run DLSA on Docker.

## Issue Tracking

-E2E DLSA tracks both bugs and enhancement requests using [Github](https://github.com/intel/document-level-sentiment-analysis/issues). We welcome input, however, before filing a request, please make sure you do the following:
+E2E DLSA tracks both bugs and enhancement requests using [GitHub](https://github.com/intel-innersource/frameworks.ai.end2end-ai-pipelines.dlsa/issues). We welcome input; however, before filing a request, please make sure you do the following:
+ Search the GitHub issue database.
diff --git a/docs/inference/ipex.md b/docs/inference/infer_ipex.md
similarity index 61%
rename from docs/inference/ipex.md
rename to docs/inference/infer_ipex.md
index e91c5df..78513ad 100644
--- a/docs/inference/ipex.md
+++ b/docs/inference/infer_ipex.md
@@ -4,8 +4,8 @@
| Category | Script |
| ------------------- | ------------------ |
-| CPU Single Instance | single_instance.sh |
-| CPU Multi Instances | multi_instance.sh |
+| IPEX Single Instance | inf_ipex_single.sh |
+| IPEX Multi-Instance | inf_ipex_multi.sh |

> Note: Please use the fine-tuned model for correct accuracy. Just change the `MODEL_NAME_OR_PATH` in the script before you run it. By default, the `MODEL_NAME_OR_PATH` is `bert-large-uncased`, which is downloaded from the Hugging Face website.

@@ -16,24 +16,21 @@
### Single instance

```
-./inference/single_instance.sh
+./inference/inf_ipex_single.sh
```

By default, it will launch 1 instance to run inference with the SST-2 dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script.

-Below is the help message by using the command `./inference/single_instance.sh -h`:
+Below is the help message by using the command `./inference/inf_ipex_single.sh -h`:

```markdown
-Usage: ./inference/single_instance.sh [OPTIONS]
+Usage: ./inference/inf_ipex_single.sh [OPTIONS]
OPTION includes:
   -l | --log_name - the log name of this round
-   -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET
+   -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET
   -b | --batch_size - batch size per instance
   -s | --sequence_len - max sequence length
-   --ipex_fp32 - wether to use ipex_fp32 precision
-   --ipex_bf16 - wether to use ipex_bf16 precision
-   --int8 - wether to use int8 precision
-   --int8_bf16 - wether to use int8_bf16 precision
+   --dtype_inf - data type used for inference
   -h | --help - displays this message
```

@@ -42,24 +39,21 @@ OPTION includes:
### Multi-instance

```
-./inference/multi_instance.sh
+./inference/inf_ipex_multi.sh
```

By default, it will launch 2 instances (1 instance/socket) to run inference with the SST-2 dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script.
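+
+For example, a multi-instance run that overrides the defaults might look like the following; the flags are documented in the help message below, and the values are only illustrative:
+
+```
+./inference/inf_ipex_multi.sh -n 2 -b 16 --dtype_inf bf16
+```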
-Below is the help message by using the command `./inference/multi_instance.sh -h`
+Below is the help message by using the command `./inference/inf_ipex_multi.sh -h`

```markdown
-Usage: ./inference/multi_instance.sh [OPTIONS]
+Usage: ./inference/inf_ipex_multi.sh [OPTIONS]
OPTION includes:
   -l | --log_name - the log name of this round
-   -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET
+   -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET
   -n | --num_of_ins_per_socket - number of instance per socket
   -b | --batch_size - batch size per instance
   -s | --sequence_len - max sequence length
-   --ipex_fp32 - wether to use ipex_fp32 precision
-   --ipex_bf16 - wether to use ipex_bf16 precision
-   --int8 - wether to use int8 precision
-   --int8_bf16 - wether to use int8_bf16 precision
+   --dtype_inf - data type used for inference
   -h | --help - displays this message
```
diff --git a/docs/inference/hf-transformers.md b/docs/inference/infer_trainer.md
similarity index 62%
rename from docs/inference/hf-transformers.md
rename to docs/inference/infer_trainer.md
index cf73841..8354086 100644
--- a/docs/inference/hf-transformers.md
+++ b/docs/inference/infer_trainer.md
@@ -4,8 +4,8 @@
|Category | Script |
|---|---|
-|CPU Single Instance | cpu_single_instance.sh |
-|CPU Multi Instances | cpu_multi_instance.sh |
+|CPU Single Instance | inf_trainer_single.sh |
+|CPU Multi-Instance | inf_trainer_multi.sh |

> Note: Please use the fine-tuned model for correct accuracy. Just change the `MODEL_NAME_OR_PATH` in the script before you run it. By default, the `MODEL_NAME_OR_PATH` is `bert-large-uncased`, which is downloaded from the Hugging Face website.

@@ -14,22 +14,21 @@
### Single instance

```
-./inference/cpu_single_instance.sh
+./inference/inf_trainer_single.sh
```

By default, it will launch 1 instance to run inference with the SST-2 dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script.

-Below is the help message by using the command `./inference/cpu_single_instance.sh -h`:
+Below is the help message by using the command `./inference/inf_trainer_single.sh -h`:

```markdown
-Usage: ./inference/cpu_single_instance.sh [OPTIONS]
+Usage: ./inference/inf_trainer_single.sh [OPTIONS]
OPTION includes:
   -l | --log_name - the log name of this round
-   -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET
+   -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET
   -b | --batch_size - batch size per instance
   -s | --sequence_len - max sequence length
-   --bf16 - whether using hf bf16 inference
-   --use_ipex - whether using ipex
+   --dtype_inf - data type used for inference
   -h | --help - displays this message
```

@@ -38,22 +37,21 @@ OPTION includes:
### Multi-instance

```
-./inference/cpu_multi_instance.sh
+./inference/inf_trainer_multi.sh
```

By default, it will launch 2 instances (1 instance/socket) to run inference with the SST-2 dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script.
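+
+As an illustration, you might run 2 instances per socket on IMDB with BF16; the flags are documented in the help message below, and the values are only examples:
+
+```
+./inference/inf_trainer_multi.sh -d imdb -n 2 -b 8 --dtype_inf bf16
+```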
-Below is the help message by using the command `./inference/cpu_multi_instance.sh -h` +Below is the help message by using the command `./inference/inf_trainer_multi.sh -h` ```markdown -Usage: ./inference/cpu_multi_instance.sh [OPTIONS] +Usage: ./inference/inf_trainer_multi.sh [OPTIONS] OPTION includes: -l | --log_name - the log name of this round - -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET + -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET -n | --num_of_ins_per_socket - number of instance per socket -b | --batch_size - batch size per instance -s | --sequence_len - max sequence length - --bf16 - whether using hf bf16 inference - --use_ipex - whether using ipex + --dtype_inf - data type used for inference -h | --help - displays this message ``` diff --git a/docs/inference/stock-pytorch.md b/docs/inference/stock-pytorch.md deleted file mode 100644 index 2525fb6..0000000 --- a/docs/inference/stock-pytorch.md +++ /dev/null @@ -1,64 +0,0 @@ -# How to Run DLSA Inference Pipeline with Stock PyTorch - -## Support Matrix - -|Categoty | Script | -|---|---| -|CPU Single Instance | single_instance.sh | -|CPU Multi Instances | multi_instance.sh | - -> Note: Please use the fine-tuned model for correct accuracy. Just change the `MODEL_NAME_OR_PATH` in the script before you running. By default, the `MODEL_NAME_OR_PATH` is `bert-large-uncased` which is downloaded from the Hugging Face website. - -## Running on CPU - -### Single instance - -``` -./inference/single_instance.sh -``` - -By default, it will launch 1 instance to run inference with SST-2 dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script. - -Below is the help message by using the command `./inference/single_instance.sh -h`: - -```markdown -Usage: ./inference/single_instance.sh [OPTIONS] -OPTION includes: - -l | --log_name - the log name of this round - -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET - -b | --batch_size - batch size per instance - -s | --sequence_len - max sequence length - ~~--ipex_fp32 - wether to use ipex_fp32 precision~~ - ~~--ipex_bf16 - wether to use ipex_bf16 precision~~ - ~~--int8 - wether to use int8 precision~~ - ~~--int8_bf16 - wether to use int8_bf16 precision~~ - -h | --help - displays this message -``` - - - -### Multi-instance - -``` -./inference/multi_instance.sh -``` - -By default, it will launch 2 instances (1 instance/socket) to run inference with SST-2 dataset and FP32 precision. You can change the configurations in the file or pass parameters when running the script. 
-Below is the help message by using the command `./inference/multi_instance.sh -h`
-
-```markdown
-Usage: ./inference/multi_instance.sh [OPTIONS]
-OPTION includes:
-   -l | --log_name - the log name of this round
-   -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET
-   -n | --num_of_ins_per_socket - number of instance per socket
-   -b | --batch_size - batch size per instance
-   -s | --sequence_len - max sequence length
-   ~~--ipex_fp32 - wether to use ipex_fp32 precision~~
-   ~~--ipex_bf16 - wether to use ipex_bf16 precision~~
-   ~~--int8 - wether to use int8 precision~~
-   ~~--int8_bf16 - wether to use int8_bf16 precision~~
-   -h | --help - displays this message
-```
-
diff --git a/profiling-transformers/deploy/install_torch_ccl.sh b/profiling-transformers/deploy/install_torch_ccl.sh
index 3fa177d..b28e20b 100755
--- a/profiling-transformers/deploy/install_torch_ccl.sh
+++ b/profiling-transformers/deploy/install_torch_ccl.sh
@@ -15,7 +15,6 @@ # and limitations under the License.
#
-#
GCC_GOOD=`gcc --version | awk '/gcc/ && ($3+0)>=8.3{print "1"}'`
if [ "x$GCC_GOOD" != "x1" ] ; then
diff --git a/profiling-transformers/deploy/install_tpp.sh b/profiling-transformers/deploy/install_tpp.sh
new file mode 100755
index 0000000..fa72d6b
--- /dev/null
+++ b/profiling-transformers/deploy/install_tpp.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+# Copyright (C) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+
+
+GCC_GOOD=$(gcc --version | awk '/gcc/ && ($3+0)>=8.3{print "1"}')
+if [ "x$GCC_GOOD" != "x1" ] ; then
+  echo "Requires gcc version 8.3.0 or newer"
+  exit 1
+fi
+
+pt_version=$(python -c "import torch; print(torch.__version__)" 2> /dev/null)
+if [ "x$pt_version" == "x" ] ; then
+  echo "Can't find pytorch version, need PyTorch 1.9 or higher..."
+  exit 1
+fi
+
+
+if ! test -d ./tpp-pytorch-extension ; then
+  git clone https://github.com/libxsmm/tpp-pytorch-extension.git
+fi
+cd tpp-pytorch-extension || exit 1
+git submodule update --init && python setup.py install
+
diff --git a/profiling-transformers/fine-tuning/train_native.sh b/profiling-transformers/fine-tuning/ft_ipex.sh
similarity index 55%
rename from profiling-transformers/fine-tuning/train_native.sh
rename to profiling-transformers/fine-tuning/ft_ipex.sh
index d22fd65..1c65a4f 100755
--- a/profiling-transformers/fine-tuning/train_native.sh
+++ b/profiling-transformers/fine-tuning/ft_ipex.sh
@@ -1,4 +1,6 @@
-# Copyright (C) 2022 Intel Corporation
+#!/bin/bash
+
+# Copyright (C) 2022 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -13,17 +15,14 @@ # and limitations under the License.
# -# - -export LOG_NAME=`date "+%m%d-%H%M"` -export DATASET="sst2" -export BATCH_SIZE=32 -export SEQUENCE_LEN=55 -export BF16_IPEX_FT=0 -export FP32_IPEX_FT=0 -export TRAIN_EPOCH=1 -export MODEL_NAME_OR_PATH="bert-large-uncased" -export OUTPUT_DIR="${OUTPUT_DIR:-./logs}" +LOG_NAME=$(date "+%m%d-%H%M") +DATASET="sst2" +BATCH_SIZE=32 +SEQUENCE_LEN=55 +DTYPE_FT="fp32" +TRAIN_EPOCH=1 +MODEL_NAME_OR_PATH="bert-large-uncased" +OUTPUT_DIR="${OUTPUT_DIR:-./logs}" while [ "$1" != "" ]; do @@ -48,36 +47,38 @@ do SEQUENCE_LEN="$1" echo "sequence_len is : $SEQUENCE_LEN" ;; - --bf16_ipex_ft ) - BF16_IPEX_FT=1 - echo "bf16_ipex_ft is : $BF16_IPEX_FT" + --dtype_ft ) + shift + DTYPE_FT="$1" + echo "dtype_ft is : $DTYPE_FT" ;; - --fp32_ipex_ft ) - FP32_IPEX_FT=1 - echo "fp32_ipex_ft is : $FP32_IPEX_FT" + --train_epoch ) + shift + TRAIN_EPOCH="$1" + echo "train_epoch is : $TRAIN_EPOCH" ;; -h | --help ) - echo "Usage: ./train_native.sh [OPTIONS]" + echo "Usage: $0 [OPTIONS]" echo "OPTION includes:" echo " -l | --log_name - the log name of this round" - echo " -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET" + echo " -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET" echo " -b | --batch_size - batch size per instance" echo " -s | --sequence_len - max sequence length" - echo " --bf16_ipex_ft - wether to use bf16_ipex_ft precision" - echo " --fp32_ipex_ft - wether to use fp32_ipex_ft precision" + echo " --dtype_ft - data type used for fine-tuning" + echo " --train_epoch - train epoch" echo " -h | --help - displays this message" exit ;; * ) echo "Invalid option: $1" - echo "Usage: train_native.sh [OPTIONS]" + echo "Usage: $0 [OPTIONS]" echo "OPTION includes:" echo " -l | --log_name - the log name of this round" - echo " -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET" + echo " -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET" echo " -b | --batch_size - batch size per instance" echo " -s | --sequence_len - max sequence length" - echo " --bf16_ipex_ft - wether to use bf16_ipex_ft precision" - echo " --fp32_ipex_ft - wether to use fp32_ipex_ft precision" + echo " --dtype_ft - data type used for fine-tuning" + echo " --train_epoch - train epoch" exit ;; esac @@ -85,30 +86,27 @@ do done if [ -z "$LOG_NAME" ]; then - pre=`date "+%m%d-%H%M"` + pre=$(date "+%m%d-%H%M") else pre=$LOG_NAME fi OUTPUT_DIR=$OUTPUT_DIR'/'$pre'/'$DATASET -echo $OUTPUT_DIR - -mkdir -p $OUTPUT_DIR +echo "$OUTPUT_DIR" +mkdir -p "$OUTPUT_DIR"/output_test export CUDA_VISIBLE_DEVICES="-1"; \ -python ./src/run_pt_native.py \ - --model_name_or_path $MODEL_NAME_OR_PATH \ - --dataset $DATASET \ - --bf16_ipex_ft $BF16_IPEX_FT \ - --fp32_ipex_ft $FP32_IPEX_FT \ - --output_dir $OUTPUT_DIR/output_test \ - --max_seq_len $SEQUENCE_LEN \ - --num_train_epochs $TRAIN_EPOCH \ - --do_train \ - --per_device_train_batch_size $BATCH_SIZE \ +python ./src/run_finetune.py \ + --model_name_or_path "$MODEL_NAME_OR_PATH" \ + --dataset "$DATASET" \ + --output_dir "$OUTPUT_DIR"/output_test \ + --finetune_impl ipex \ + --dtype_ft "$DTYPE_FT" \ + --do_train \ + --max_seq_len "$SEQUENCE_LEN" \ + --num_train_epochs "$TRAIN_EPOCH" \ + --per_device_train_batch_size "$BATCH_SIZE" \ --do_predict \ --per_device_eval_batch_size 8 \ - 2>&1 | tee $OUTPUT_DIR/test_$i.log - - + 2>&1 | tee "$OUTPUT_DIR"/test.log diff --git a/profiling-transformers/fine-tuning/ft_ipex_ccl.sh b/profiling-transformers/fine-tuning/ft_ipex_ccl.sh new file mode 100755 index 0000000..711b68f --- /dev/null +++ 
b/profiling-transformers/fine-tuning/ft_ipex_ccl.sh @@ -0,0 +1,96 @@ +#!/bin/bash + +# Copyright (C) 2022 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions +# and limitations under the License. +# + +DATASET="sst2" +BATCH_SIZE=32 +SEQUENCE_LEN=55 +DTYPE_FT="fp32" +TRAIN_EPOCH=1 +MODEL_NAME_OR_PATH="bert-large-uncased" + +while [ "$1" != "" ]; +do + case $1 in + -d | --dataset ) + shift + DATASET="$1" + echo "dataset is : $DATASET" + ;; + -o ) + shift + OUTPUT_DIR="$1" + echo "output_dir is : $OUTPUT_DIR" + ;; + -b | --batch_size ) + shift + BATCH_SIZE="$1" + echo "batch size per instance is : $BATCH_SIZE" + ;; + -s | --sequence_len ) + shift + SEQUENCE_LEN="$1" + echo "sequence_len is : $SEQUENCE_LEN" + ;; + --dtype_ft ) + shift + DTYPE_FT="$1" + echo "dtype_ft is : $DTYPE_FT" + ;; + --train_epoch ) + shift + TRAIN_EPOCH="$1" + echo "train_epoch is : $TRAIN_EPOCH" + ;; + -h | --help ) + echo "Usage: $0 [OPTIONS]" + echo "OPTION includes:" + echo " -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET" + echo " -b | --batch_size - batch size per instance" + echo " -s | --sequence_len - max sequence length" + echo " --dtype_ft - data type used for fine-tuning" + echo " --train_epoch - train epoch" + echo " -h | --help - displays this message" + exit + ;; + * ) + echo "Invalid option: $1" + echo "Usage: $0 [OPTIONS]" + echo "OPTION includes:" + echo " -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET" + echo " -b | --batch_size - batch size per instance" + echo " -s | --sequence_len - max sequence length" + echo " --dtype_ft - data type used for fine-tuning" + echo " --train_epoch - train epoch" + exit + ;; + esac + shift +done + +export CUDA_VISIBLE_DEVICES="-1"; \ +python ./src/run_finetune.py \ + --model_name_or_path "$MODEL_NAME_OR_PATH" \ + --dataset "$DATASET" \ + --output_dir "$OUTPUT_DIR"/output_test \ + --finetune_impl ipex_ccl \ + --dtype_ft "$DTYPE_FT" \ + --do_train \ + --max_seq_len "$SEQUENCE_LEN" \ + --num_train_epochs "$TRAIN_EPOCH" \ + --per_device_train_batch_size "$BATCH_SIZE" \ + --do_predict \ + --per_device_eval_batch_size 8 diff --git a/profiling-transformers/fine-tuning/train_trainer.sh b/profiling-transformers/fine-tuning/ft_tpp.sh similarity index 58% rename from profiling-transformers/fine-tuning/train_trainer.sh rename to profiling-transformers/fine-tuning/ft_tpp.sh index ece76b0..4a01006 100755 --- a/profiling-transformers/fine-tuning/train_trainer.sh +++ b/profiling-transformers/fine-tuning/ft_tpp.sh @@ -1,3 +1,5 @@ +#!/bin/bash + # Copyright (C) 2022 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,17 +15,14 @@ # and limitations under the License. 
# -# - -export LOG_NAME=`date "+%m%d-%H%M"` -export DATASET="sst2" -export BATCH_SIZE=32 -export SEQUENCE_LEN=55 -export BF16="" -export USE_IPEX="" -export TRAIN_EPOCH=1 -export MODEL_NAME_OR_PATH="bert-large-uncased" -export OUTPUT_DIR="${OUTPUT_DIR:-./logs}" +LOG_NAME=$(date "+%m%d-%H%M") +DATASET="imdb" +BATCH_SIZE=32 +SEQUENCE_LEN=512 +DTYPE_FT="fp32" +TRAIN_EPOCH=1 +MODEL_NAME_OR_PATH="bert-large-uncased" +OUTPUT_DIR="${OUTPUT_DIR:-./logs}" while [ "$1" != "" ]; do @@ -48,36 +47,38 @@ do SEQUENCE_LEN="$1" echo "sequence_len is : $SEQUENCE_LEN" ;; - --bf16 ) - BF16="--bf16" - echo "use bf16" + --dtype_ft ) + shift + DTYPE_FT="$1" + echo "dtype_ft is : $DTYPE_FT" ;; - --use_ipex ) - USE_IPEX=1 - echo "use_ipex is : $USE_IPEX" + --train_epoch ) + shift + TRAIN_EPOCH="$1" + echo "train_epoch is : $TRAIN_EPOCH" ;; -h | --help ) - echo "Usage: ./fine-tuning/train_trainer.sh [OPTIONS]" + echo "Usage: $0 [OPTIONS]" echo "OPTION includes:" echo " -l | --log_name - the log name of this round" - echo " -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET" + echo " -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET" echo " -b | --batch_size - batch size per instance" echo " -s | --sequence_len - max sequence length" - echo " --bf16 - whether using hf bf16 inference" - echo " --use_ipex - whether using ipex" + echo " --dtype_ft - data type used for fine-tuning" + echo " --train_epoch - train epoch" echo " -h | --help - displays this message" exit ;; * ) echo "Invalid option: $1" - echo "Usage: ./fine-tuning/train_trainer.sh [OPTIONS]" + echo "Usage: $0 [OPTIONS]" echo "OPTION includes:" echo " -l | --log_name - the log name of this round" - echo " -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET" + echo " -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET" echo " -b | --batch_size - batch size per instance" echo " -s | --sequence_len - max sequence length" - echo " --bf16 - whether using hf bf16 inference" - echo " --use_ipex - whether using ipex" + echo " --dtype_ft - data type used for fine-tuning" + echo " --train_epoch - train epoch" exit ;; esac @@ -85,31 +86,29 @@ do done if [ -z "$LOG_NAME" ]; then - pre=`date "+%m%d-%H%M"` + pre=$(date "+%m%d-%H%M") else pre=$LOG_NAME fi OUTPUT_DIR=$OUTPUT_DIR'/'$pre'/'$DATASET -echo $OUTPUT_DIR - -mkdir -p $OUTPUT_DIR +echo "$OUTPUT_DIR" +mkdir -p "$OUTPUT_DIR"/output_test export CUDA_VISIBLE_DEVICES="-1"; \ -python ./src/run_pt.py \ - --model_name_or_path $MODEL_NAME_OR_PATH \ - --dataset $DATASET \ - --output_dir $OUTPUT_DIR/output_test \ - --max_seq_len $SEQUENCE_LEN \ - --num_train_epochs $TRAIN_EPOCH \ - --do_train \ - --per_device_train_batch_size $BATCH_SIZE \ +python ./src/run_finetune.py \ + --model_name_or_path "$MODEL_NAME_OR_PATH" \ + --dataset "$DATASET" \ + --output_dir "$OUTPUT_DIR"/output_test \ + --finetune_impl tpp \ + --do_train \ + --dtype_ft "$DTYPE_FT" \ + --use_tpp --unpad \ + --max_seq_len "$SEQUENCE_LEN" \ + --num_train_epochs "$TRAIN_EPOCH" \ + --per_device_train_batch_size "$BATCH_SIZE" \ --do_predict \ --per_device_eval_batch_size 8 \ - --no_cuda \ - $BF16 \ - $USE_IPEX \ - 2>&1 | tee $OUTPUT_DIR/test_$i.log - - + "$@" \ + 2>&1 | tee "$OUTPUT_DIR"/test.log diff --git a/profiling-transformers/fine-tuning/ft_tpp_ccl.sh b/profiling-transformers/fine-tuning/ft_tpp_ccl.sh new file mode 100755 index 0000000..7656aa3 --- /dev/null +++ b/profiling-transformers/fine-tuning/ft_tpp_ccl.sh @@ -0,0 +1,97 @@ +#!/bin/bash + +# Copyright (C) 2022 Intel Corporation +# +# Licensed under the 
Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions +# and limitations under the License. +# + +DATASET="imdb" +BATCH_SIZE=32 +SEQUENCE_LEN=512 +DTYPE_FT="fp32" +TRAIN_EPOCH=1 +MODEL_NAME_OR_PATH="bert-large-uncased" + +while [ "$1" != "" ]; +do + case $1 in + -d | --dataset ) + shift + DATASET="$1" + echo "dataset is : $DATASET" + ;; + -o ) + shift + OUTPUT_DIR="$1" + echo "output_dir is : $OUTPUT_DIR" + ;; + -b | --batch_size ) + shift + BATCH_SIZE="$1" + echo "batch size per instance is : $BATCH_SIZE" + ;; + -s | --sequence_len ) + shift + SEQUENCE_LEN="$1" + echo "sequence_len is : $SEQUENCE_LEN" + ;; + --dtype_ft ) + shift + DTYPE_FT="$1" + echo "dtype_ft is : $DTYPE_FT" + ;; + --train_epoch ) + shift + TRAIN_EPOCH="$1" + echo "train_epoch is : $TRAIN_EPOCH" + ;; + -h | --help ) + echo "Usage: $0 [OPTIONS]" + echo "OPTION includes:" + echo " -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET" + echo " -b | --batch_size - batch size per instance" + echo " -s | --sequence_len - max sequence length" + echo " --dtype_ft - data type used for fine-tuning" + echo " --train_epoch - train epoch" + echo " -h | --help - displays this message" + exit + ;; + * ) + echo "Invalid option: $1" + echo "Usage: $0 [OPTIONS]" + echo "OPTION includes:" + echo " -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET" + echo " -b | --batch_size - batch size per instance" + echo " -s | --sequence_len - max sequence length" + echo " --dtype_ft - data type used for fine-tuning" + echo " --train_epoch - train epoch" + exit + ;; + esac + shift +done + +export CUDA_VISIBLE_DEVICES="-1"; \ +python ./src/run_finetune.py \ + --model_name_or_path "$MODEL_NAME_OR_PATH" \ + --dataset "$DATASET" \ + --output_dir "$OUTPUT_DIR"/output_test \ + --finetune_impl tpp_ccl \ + --do_train \ + --dtype_ft "$DTYPE_FT" \ + --max_seq_len "$SEQUENCE_LEN" \ + --num_train_epochs "$TRAIN_EPOCH" \ + --per_device_train_batch_size "$BATCH_SIZE" \ + --do_predict \ + --per_device_eval_batch_size 256 \ + --use_tpp --unpad diff --git a/profiling-transformers/fine-tuning/ft_trainer.sh b/profiling-transformers/fine-tuning/ft_trainer.sh new file mode 100755 index 0000000..c2f12ff --- /dev/null +++ b/profiling-transformers/fine-tuning/ft_trainer.sh @@ -0,0 +1,110 @@ +#!/bin/bash + +# Copyright (C) 2022 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions +# and limitations under the License. 
+#
+
+LOG_NAME=$(date "+%m%d-%H%M")
+DATASET="sst2"
+BATCH_SIZE=32
+SEQUENCE_LEN=55
+TRAIN_EPOCH=1
+MODEL_NAME_OR_PATH="bert-large-uncased"
+OUTPUT_DIR="${OUTPUT_DIR:-./logs}"
+DTYPE_FT="fp32"
+APPEND=""
+
+while [ "$1" != "" ];
+do
+    case $1 in
+        -l | --log_name )
+            shift
+            LOG_NAME="$1"
+            echo "log name is $LOG_NAME"
+            ;;
+        -d | --dataset )
+            shift
+            DATASET="$1"
+            echo "dataset is : $DATASET"
+            ;;
+        -b | --batch_size )
+            shift
+            BATCH_SIZE="$1"
+            echo "batch size per instance is : $BATCH_SIZE"
+            ;;
+        -s | --sequence_len )
+            shift
+            SEQUENCE_LEN="$1"
+            echo "sequence_len is : $SEQUENCE_LEN"
+            ;;
+        --dtype_ft )
+            shift
+            DTYPE_FT="$1"
+            echo "dtype_ft is : $DTYPE_FT"
+            ;;
+        -h | --help )
+            echo "Usage: $0 [OPTIONS]"
+            echo "OPTION includes:"
+            echo "   -l | --log_name - the log name of this round"
+            echo "   -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET"
+            echo "   -b | --batch_size - batch size per instance"
+            echo "   -s | --sequence_len - max sequence length"
+            echo "   --dtype_ft - data type used for fine-tuning"
+            echo "   -h | --help - displays this message"
+            exit
+            ;;
+        * )
+            echo "Invalid option: $1"
+            echo "Usage: $0 [OPTIONS]"
+            echo "OPTION includes:"
+            echo "   -l | --log_name - the log name of this round"
+            echo "   -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET"
+            echo "   -b | --batch_size - batch size per instance"
+            echo "   -s | --sequence_len - max sequence length"
+            echo "   --dtype_ft - data type used for fine-tuning"
+            exit
+            ;;
+    esac
+    shift
+done
+if [ "$DTYPE_FT" == "bf16" ]; then
+    APPEND="--bf16 --use_ipex"
+fi
+
+if [ -z "$LOG_NAME" ]; then
+    pre=$(date "+%m%d-%H%M")
+else
+    pre=$LOG_NAME
+fi
+
+OUTPUT_DIR=$OUTPUT_DIR'/'$pre'/'$DATASET
+echo "$OUTPUT_DIR"
+
+mkdir -p "$OUTPUT_DIR"/output_test
+
+export CUDA_VISIBLE_DEVICES="-1"; \
+python ./src/run_finetune.py \
+    --model_name_or_path "$MODEL_NAME_OR_PATH" \
+    --dataset "$DATASET" \
+    --output_dir "$OUTPUT_DIR"/output_test \
+    --finetune_impl trainer \
+    --max_seq_len "$SEQUENCE_LEN" \
+    --num_train_epochs "$TRAIN_EPOCH" \
+    --do_train \
+    --per_device_train_batch_size "$BATCH_SIZE" \
+    --do_predict \
+    --per_device_eval_batch_size 8 \
+    --no_cuda \
+    $APPEND \
+    2>&1 | tee "$OUTPUT_DIR"/test.log
diff --git a/profiling-transformers/fine-tuning/run_dist.sh b/profiling-transformers/fine-tuning/run_dist.sh
index 8d2e94d..ebd1ab8 100755
--- a/profiling-transformers/fine-tuning/run_dist.sh
+++ b/profiling-transformers/fine-tuning/run_dist.sh
@@ -1,5 +1,6 @@
#!/bin/bash
-# Copyright (C) 2022 Intel Corporation
+
+# Copyright (C) 2022 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -14,8 +15,8 @@ # and limitations under the License.
#
-#
+OUTPUT_DIR="${OUTPUT_DIR:-./logs}"
function print_vars {
  for VAR in ${!CCL*} ${!I_MPI*} ${!i_mpi*} ${!KMP_*} ${!OMP_*} ${!ATL_*} LD_PRELOAD ${!DLRM_*} ${!PYTORCH_*} ${!PCL_*} ${!LIBXSMM_*} ${!EMULATE_*} DATALOADER_WORKER_COUNT VIRTUAL_ENV ${!ARGS_*} $@ ; do
    if ! test -z ${!VAR} ; then
@@ -28,6 +29,10 @@ SINGLE_SOCKET_ONLY=0
while (( "$#" )); do
  case "$1" in
+    -l)
+      LOG_NAME=$2
+      shift 2
+      ;;
    -n|-np)
      ARGS_NTASKS=$2
      shift 2
      ;;
@@ -58,6 +63,17 @@ while (( "$#" )); do
  esac
done
+if [ -z "$LOG_NAME" ]; then
+  pre=$(date "+%m%d-%H%M")
+else
+  pre=$LOG_NAME
+fi
+
+OUTPUT_DIR=$OUTPUT_DIR'/'$pre'/'$DATASET
+echo "$OUTPUT_DIR"
+
+mkdir -p "$OUTPUT_DIR"/output_test
+
if !
test -z $SLURM_JOB_ID ; then PREFIX="srun -n 1 -N 1 " else @@ -195,7 +211,7 @@ echo "Running mpiexec.hydra $@" echo "Start Time: `date`" SECONDS=0 #mpiexec.hydra ${MPIEXE_ARGS} ${CMD} $@ -mpiexec.hydra $@ +mpiexec.hydra $@ -o "$OUTPUT_DIR" 2>&1 | tee "$OUTPUT_DIR"/test.log echo "End Time: `date`" duration=$SECONDS echo "Total Time: $(($duration / 60)) min and $(($duration % 60)) sec" diff --git a/profiling-transformers/fine-tuning/run_ipex_native.sh b/profiling-transformers/fine-tuning/run_ipex_native.sh deleted file mode 100755 index 4e41abf..0000000 --- a/profiling-transformers/fine-tuning/run_ipex_native.sh +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (C) 2022 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions -# and limitations under the License. -# - -# - -# export CUDA_VISIBLE_DEVICES="-1"; \ -MODEL_NAME_OR_PATH="${MODEL_NAME_OR_PATH:-bert-large-uncased}" -DATASET="${DATASET:-sst2}" -MAX_SEQ_LEN=55 -NUM_TRAIN_EPOCHS=1 -OUTPUT_DIR="${OUTPUT_DIR:-fine_tuned}" -TRAINNING_BS=32 -INFERENCE_BS=8 - #--bf16_ft \ -python src/run_pt_native_ft.py \ - --model_name_or_path $MODEL_NAME_OR_PATH \ - --dataset $DATASET \ - --num_train_epochs $NUM_TRAIN_EPOCHS \ - --max_seq_len $MAX_SEQ_LEN \ - --output_dir $OUTPUT_DIR \ - --do_train \ - --per_device_train_batch_size $TRAINNING_BS \ - --do_predict \ - --per_device_eval_batch_size $INFERENCE_BS \ - --logging_strategy epoch \ - $@ diff --git a/profiling-transformers/inference/multi_instance.sh b/profiling-transformers/inference/inf_ipex_multi.sh similarity index 52% rename from profiling-transformers/inference/multi_instance.sh rename to profiling-transformers/inference/inf_ipex_multi.sh index c5f7965..7ce54b5 100755 --- a/profiling-transformers/inference/multi_instance.sh +++ b/profiling-transformers/inference/inf_ipex_multi.sh @@ -1,5 +1,6 @@ #!/bin/bash -# Copyright (C) 2022 Intel Corporation + +# Copyright (C) 2022 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,22 +15,14 @@ # and limitations under the License. 
# -# -export KMP_SETTINGS=1 -export KMP_BLOCKTIME=1 -export OMP_MAX_ACTIVE_LEVELS=1 - -export LOG_NAME=`date "+%m%d-%H%M"` -export DATASET="sst2" -export NUMBER_OF_INSTANCE_PER_SOCKET=1 -export BATCH_SIZE=8 -export SEQUENCE_LEN=55 -export IPEX_BF16=0 -export IPEX_FP32=0 -export INT8=0 -export INT8_BF16=0 -export MODEL_NAME_OR_PATH="${MODEL_NAME_OR_PATH:-bert-large-uncased}" -export OUTPUT_DIR="${OUTPUT_DIR:-./logs}" +LOG_NAME=$(date "+%m%d-%H%M") +DATASET="sst2" +NUMBER_OF_INSTANCE_PER_SOCKET=1 +BATCH_SIZE=8 +SEQUENCE_LEN=55 +DTYPE_INF="fp32" +MODEL_NAME_OR_PATH="${MODEL_NAME_OR_PATH:-bert-large-uncased}" +OUTPUT_DIR="${OUTPUT_DIR:-./logs}" while [ "$1" != "" ]; do @@ -64,52 +57,35 @@ do SEQUENCE_LEN="$1" echo "sequence_len is : $SEQUENCE_LEN" ;; - --ipex_bf16 ) - IPEX_BF16=1 - echo "ipex_bf16 is : $IPEX_BF16" - ;; - --ipex_fp32 ) - IPEX_FP32=1 - echo "ipex_fp32 is : $IPEX_FP32" - ;; - --int8 ) - INT8=1 - echo "int8 is : $INT8" - ;; - --int8_bf16 ) - INT8_BF16=1 - echo "int8_bf16 is : $INT8_BF16" + --dtype_inf ) + shift + DTYPE_INF="$1" + echo "dtype_inf is : $DTYPE_INF" ;; -h | --help ) - echo "Usage: ./inference/multi_instance.sh [OPTIONS]" + echo "Usage: $0 [OPTIONS]" echo "OPTION includes:" echo " -l | --log_name - the log name of this round" - echo " -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET" + echo " -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET" echo " -n | --num_of_ins_per_socket - number of instance per socket" # echo " -c | --cores_per_instance - cores per instance" echo " -b | --batch_size - batch size per instance" echo " -s | --sequence_len - max sequence length" - echo " --ipex_bf16 - wether to use ipex_bf16 precision" - echo " --ipex_fp32 - wether to use ipex_fp32 precision" - echo " --int8 - wether to use int8 precision" - echo " --int8_bf16 - wether to use int8_bf16 precision" + echo " --dtype_inf - data type used for inference" echo " -h | --help - displays this message" exit ;; * ) echo "Invalid option: $1" - echo "Usage: inference/multi_instance.sh [OPTIONS]" + echo "Usage: $0 [OPTIONS]" echo "OPTION includes:" echo " -l | --log_name - the log name of this round" - echo " -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET" + echo " -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET" echo " -n | --num_of_ins_per_socket - number of instance per socket" # echo " -c | --cores_per_instance - cores per instance" echo " -b | --batch_size - batch size per instance" echo " -s | --sequence_len - max sequence length" - echo " --ipex_bf16 - wether to use ipex_bf16 precision" - echo " --ipex_fp32 - wether to use ipex_fp32 precision" - echo " --int8 - wether to use int8 precision" - echo " --int8_bf16 - wether to use int8_bf16 precision" + echo " --dtype_inf - data type used for inference" exit ;; esac @@ -117,7 +93,7 @@ do done if [ -z "$LOG_NAME" ]; then - pre=`date "+%m%d-%H%M"` + pre=$(date "+%m%d-%H%M") else pre=$LOG_NAME fi @@ -125,7 +101,7 @@ fi if [ -z "$DATASET" ]; then echo "Error: Please enter the DATASET ot use [imdb|sst2]" exit -elif [ $DATASET != "imdb" -a $DATASET != "sst2" ]; then +elif [ "$DATASET" != "imdb" ] && [ "$DATASET" != "sst2" ]; then echo "Error: The DATASET $DATASET cannot be recognized, please enter 'imdb' or 'sst2'" exit fi @@ -140,80 +116,66 @@ fi # exit #fi -if [ $IPEX_BF16 = 1 ]; then - if [ $INT8 = 1 -o $INT8_BF16 = 1 ]; then - echo "Error: Cannot set IPEX_BF16 and INT8 at the same time" - exit - fi -else - if [ $INT8 = 0 -a $INT8_BF16 = 1 ]; then - echo "Error: Cannot set INT8_BF16 
without INT8 option" - exit - fi -fi - if [ -z "$BATCH_SIZE" ]; then echo "Error: Please set the batch size per instance using -b or --BATCH_SIZE" exit fi -if [ -z $SEQUENCE_LEN ]; then - if [ $DATASET = 'imdb' ]; then +if [ -z "$SEQUENCE_LEN" ]; then + if [ "$DATASET" = 'imdb' ]; then SEQUENCE_LEN=512 - elif [ $DATASET = 'sst2' ]; then + elif [ "$DATASET" = 'sst2' ]; then SEQUENCE_LEN=55 fi echo "WARNING: SEQUENCE_LEN is not set, using default DATASET ($DATASET) sequence length $SEQUENCE_LEN" fi -all_core_number=`cat /proc/cpuinfo |grep "processor"|wc -l` -socket_number=`lscpu | grep "Socket(s)" | awk '{print $2}'` -core_number_per_socket=$(($all_core_number / $socket_number)) -instance_number=$(($NUMBER_OF_INSTANCE_PER_SOCKET * $socket_number)) +all_core_number=$(grep -c "processor" /proc/cpuinfo) +socket_number=$(lscpu | grep "Socket(s)" | awk '{print $2}') +core_number_per_socket=$((all_core_number / socket_number)) +instance_number=$((NUMBER_OF_INSTANCE_PER_SOCKET * socket_number)) -if [ $(($core_number_per_socket % $NUMBER_OF_INSTANCE_PER_SOCKET)) != 0 ]; then - echo "\`instance_numberi_per_socket($NUMBER_OF_INSTANCE_PER_SOCKET)\` cannot be divisible by \`core_number_per_socket($core_number_per_socket)\`" +if [ $((core_number_per_socket % NUMBER_OF_INSTANCE_PER_SOCKET)) != 0 ]; then + echo "\`instance_number_per_socket($NUMBER_OF_INSTANCE_PER_SOCKET)\` cannot be divisible by \`core_number_per_socket($core_number_per_socket)\`" exit else - cores_per_instance=$(($core_number_per_socket / $NUMBER_OF_INSTANCE_PER_SOCKET)) + cores_per_instance=$((core_number_per_socket / NUMBER_OF_INSTANCE_PER_SOCKET)) fi -if [ $DATASET = 'imdb' ]; then - max_test_samples=$((25000/$instance_number)) +if [ "$DATASET" = 'imdb' ]; then + max_test_samples=$((25000/instance_number)) else - max_test_samples=$((872/$instance_number)) + max_test_samples=$((872/instance_number)) fi OUTPUT_DIR=$OUTPUT_DIR'/'$pre'/'$DATASET echo "log directory is $OUTPUT_DIR" -mkdir -p $OUTPUT_DIR +mkdir -p "$OUTPUT_DIR"/output_test for i in $(seq 1 $instance_number) do export OMP_NUM_THREADS=$cores_per_instance - start_index=$(( ($i-1) * $cores_per_instance)) - end_index=$(( ($i * $cores_per_instance) -1)) - mem_bind=$(( $start_index / $core_number_per_socket)) + start_index=$(( (i-1) * cores_per_instance)) + end_index=$(( (i * cores_per_instance) -1)) + mem_bind=$(( start_index / core_number_per_socket)) echo "\`start core index\` is $start_index" echo "\`end core index \` is $end_index" echo "\`memory bind\` is $mem_bind" str="numactl -C $start_index-$end_index -m $mem_bind" - echo $str - nohup numactl -C $start_index-$end_index -m $mem_bind python ./src/run_pt_native_inf.py \ - --model_name_or_path $MODEL_NAME_OR_PATH \ - --dataset $DATASET \ - --int8 $INT8 \ - --int8_bf16 $INT8_BF16 \ - --ipex_bf16 $IPEX_BF16 \ - --ipex_fp32 $IPEX_FP32 \ + echo "$str" + nohup numactl -C $start_index-$end_index -m $mem_bind python ./src/run_infer.py \ + --model_name_or_path "$MODEL_NAME_OR_PATH" \ + --dataset "$DATASET" \ + --dtype_inf "$DTYPE_INF" \ --multi_instance \ - --output_dir $OUTPUT_DIR/output_test \ + --output_dir "$OUTPUT_DIR"/output_test \ + --infer_impl ipex \ --do_predict \ --max_seq_len $SEQUENCE_LEN \ - --instance_index $i \ + --instance_index "$i" \ --max_test_samples $max_test_samples \ - --per_device_eval_batch_size $BATCH_SIZE \ - > $OUTPUT_DIR/test_$i.log 2>&1 & + --per_device_eval_batch_size "$BATCH_SIZE" \ + > "$OUTPUT_DIR"/test_"$i".log 2>&1 & done diff --git a/profiling-transformers/inference/inf_ipex_single.sh 
b/profiling-transformers/inference/inf_ipex_single.sh new file mode 100755 index 0000000..38b3d1a --- /dev/null +++ b/profiling-transformers/inference/inf_ipex_single.sh @@ -0,0 +1,104 @@ +#!/bin/bash + +# Copyright (C) 2022 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions +# and limitations under the License. +# + +LOG_NAME=$(date "+%m%d-%H%M") +DATASET="sst2" +BATCH_SIZE=8 +SEQUENCE_LEN=55 +DTYPE_INF="fp32" +MODEL_NAME_OR_PATH="${MODEL_NAME_OR_PATH:-bert-large-uncased}" +OUTPUT_DIR="${OUTPUT_DIR:-./logs}" + +while [ "$1" != "" ]; +do + case $1 in + -l | --log_name ) + shift + LOG_NAME="$1" + echo "log name is $LOG_NAME" + ;; + -d | --dataset ) + shift + DATASET="$1" + echo "dataset is : $DATASET" + ;; + -b | --batch_size ) + shift + BATCH_SIZE="$1" + echo "batch size per instance is : $BATCH_SIZE" + ;; + -s | --sequence_len ) + shift + SEQUENCE_LEN="$1" + echo "sequence_len is : $SEQUENCE_LEN" + ;; + --dtype_inf ) + shift + DTYPE_INF="$1" + echo "dtype_inf is : $DTYPE_INF" + ;; + -h | --help ) + echo "Usage: $0 [OPTIONS]" + echo "OPTION includes:" + echo " -l | --log_name - the log name of this round" + echo " -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET" + echo " -b | --batch_size - batch size per instance" + echo " -s | --sequence_len - max sequence length" + echo " --dtype_inf - data type used for inference" + echo " -h | --help - displays this message" + exit + ;; + * ) + echo "Invalid option: $1" + echo "Usage: $0 [OPTIONS]" + echo "OPTION includes:" + echo " -l | --log_name - the log name of this round" + echo " -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET" + echo " -b | --batch_size - batch size per instance" + echo " -s | --sequence_len - max sequence length" + echo " --dtype_inf - data type used for inference" + exit + ;; + esac + shift +done + +if [ -z "$LOG_NAME" ]; then + pre=$(date "+%m%d-%H%M") +else + pre=$LOG_NAME +fi + +OUTPUT_DIR=$OUTPUT_DIR'/'$pre'/'$DATASET +echo "$OUTPUT_DIR" + +mkdir -p "$OUTPUT_DIR"/output_test + + +export CUDA_VISIBLE_DEVICES="-1"; \ +python ./src/run_infer.py \ + --model_name_or_path "$MODEL_NAME_OR_PATH" \ + --dataset "$DATASET" \ + --output_dir "$OUTPUT_DIR"/output_test \ + --infer_impl ipex \ + --dtype_inf "$DTYPE_INF" \ + --do_predict \ + --max_seq_len "$SEQUENCE_LEN" \ + --per_device_eval_batch_size "$BATCH_SIZE" \ + 2>&1 | tee "$OUTPUT_DIR"/test.log + + diff --git a/profiling-transformers/inference/cpu_multi_instance.sh b/profiling-transformers/inference/inf_trainer_multi.sh similarity index 60% rename from profiling-transformers/inference/cpu_multi_instance.sh rename to profiling-transformers/inference/inf_trainer_multi.sh index 807ed99..564f070 100755 --- a/profiling-transformers/inference/cpu_multi_instance.sh +++ b/profiling-transformers/inference/inf_trainer_multi.sh @@ -1,4 +1,6 @@ -# Copyright (C) 2022 Intel Corporation +#!/bin/bash + +# Copyright (C) 2022 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/profiling-transformers/inference/cpu_multi_instance.sh b/profiling-transformers/inference/inf_trainer_multi.sh
similarity index 60%
rename from profiling-transformers/inference/cpu_multi_instance.sh
rename to profiling-transformers/inference/inf_trainer_multi.sh
index 807ed99..564f070 100755
--- a/profiling-transformers/inference/cpu_multi_instance.sh
+++ b/profiling-transformers/inference/inf_trainer_multi.sh
@@ -1,4 +1,6 @@
-# Copyright (C) 2022 Intel Corporation
+#!/bin/bash
+
+# Copyright (C) 2022 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,21 +15,15 @@
 # and limitations under the License.
 #
-#
-
-export KMP_SETTINGS=1
-export KMP_BLOCKTIME=1
-export OMP_MAX_ACTIVE_LEVELS=1
-
-export LOG_NAME=`date "+%m%d-%H%M"`
-export DATASET="sst2"
-export NUMBER_OF_INSTANCE_PER_SOCKET=1
-export BATCH_SIZE=8
-export SEQUENCE_LEN=55
-export MODEL_NAME_OR_PATH="${MODEL_NAME_OR_PATH:-bert-large-uncased}"
-export OUTPUT_DIR="${OUTPUT_DIR:-./logs}"
-export USE_IPEX=""
-export BF16=""
+LOG_NAME=$(date "+%m%d-%H%M")
+DATASET="sst2"
+NUMBER_OF_INSTANCE_PER_SOCKET=1
+BATCH_SIZE=8
+SEQUENCE_LEN=55
+MODEL_NAME_OR_PATH="${MODEL_NAME_OR_PATH:-bert-large-uncased}"
+OUTPUT_DIR="${OUTPUT_DIR:-./logs}"
+DTYPE_INF="fp32"
+APPEND=""
 
 while [ "$1" != "" ];
@@ -63,48 +59,47 @@ do
             SEQUENCE_LEN="$1"
             echo "sequence_len is : $SEQUENCE_LEN"
             ;;
-        --use_ipex )
-            USE_IPEX="--use_ipex"
-            echo " use ipex"
-            ;;
-        --bf16 )
-            BF16="--bf16"
-            echo "using hf bf16 inference"
-            ;;
+        --dtype_inf )
+            shift
+            DTYPE_INF="$1"
+            echo "dtype_inf is : $DTYPE_INF"
+            ;;
         -h | --help )
-            echo "Usage: ./inference/cpu_multi_instance.sh [OPTIONS]"
+            echo "Usage: $0 [OPTIONS]"
             echo "OPTION includes:"
             echo "   -l | --log_name - the log name of this round"
-            echo "   -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET"
+            echo "   -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET"
             echo "   -n | --num_of_ins_per_socket - number of instance per socket"
 #            echo "   -c | --cores_per_instance - cores per instance"
             echo "   -b | --batch_size - batch size per instance"
             echo "   -s | --sequence_len - max sequence length"
-            echo "   --bf16 - whether using hf bf16 inference"
-            echo "   --use_ipex - whether using ipex"
+            echo "   --dtype_inf - data type used for inference"
             echo "   -h | --help - displays this message"
             exit
             ;;
         * )
             echo "Invalid option: $1"
-            echo "Usage: inference/cpu_multi_instance.sh [OPTIONS]"
+            echo "Usage: $0 [OPTIONS]"
             echo "OPTION includes:"
             echo "   -l | --log_name - the log name of this round"
-            echo "   -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET"
+            echo "   -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET"
             echo "   -n | --num_of_ins_per_socket - number of instance per socket"
 #            echo "   -c | --cores_per_instance - cores per instance"
             echo "   -b | --batch_size - batch size per instance"
             echo "   -s | --sequence_len - max sequence length"
-            echo "   --bf16 - whether using hf bf16 inference"
-            echo "   --use_ipex - whether using ipex"
+            echo "   --dtype_inf - data type used for inference"
             exit
             ;;
     esac
     shift
 done
 
+if [ "$DTYPE_INF" == "bf16" ]; then
+    APPEND="--bf16 --use_ipex"
+fi
+
 if [ -z "$LOG_NAME" ]; then
-    pre=`date "+%m%d-%H%M"`
+    pre=$(date "+%m%d-%H%M")
 else
     pre=$LOG_NAME
 fi
@@ -112,7 +107,7 @@ fi
 if [ -z "$DATASET" ]; then
     echo "Error: Please enter the DATASET to use [imdb|sst2]"
     exit
-elif [ $DATASET != "imdb" -a $DATASET != "sst2" ]; then
+elif [ "$DATASET" != "imdb" ] && [ "$DATASET" != "sst2" ]; then
     echo "Error: The DATASET $DATASET cannot be recognized, please enter 'imdb' or 'sst2'"
     exit
 fi
@@ -132,62 +127,61 @@ if [ -z "$BATCH_SIZE" ]; then
     exit
 fi
 
-if [ -z $SEQUENCE_LEN ]; then
-    if [ $DATASET = 'imdb' ]; then
+if [ -z "$SEQUENCE_LEN" ]; then
+    if [ "$DATASET" = 'imdb' ]; then
         SEQUENCE_LEN=512
-    elif [ $DATASET = 'sst2' ]; then
+    elif [ "$DATASET" = 'sst2' ]; then
         SEQUENCE_LEN=55
     fi
     echo "WARNING: SEQUENCE_LEN is not set, using default DATASET ($DATASET) sequence length $SEQUENCE_LEN"
 fi
 
+all_core_number=$(grep -c "processor" /proc/cpuinfo)
+socket_number=$(lscpu | grep "Socket(s)" | awk '{print $2}')
+core_number_per_socket=$((all_core_number / socket_number))
+instance_number=$((NUMBER_OF_INSTANCE_PER_SOCKET * socket_number))
 
-all_core_number=`cat /proc/cpuinfo |grep "processor"|wc -l`
-socket_number=`lscpu | grep "Socket(s)" | awk '{print $2}'`
-core_number_per_socket=$(($all_core_number / $socket_number))
-instance_number=$(($NUMBER_OF_INSTANCE_PER_SOCKET * $socket_number))
-
-if [ $(($core_number_per_socket % $NUMBER_OF_INSTANCE_PER_SOCKET)) != 0 ]; then
-    echo "\`instance_numberi_per_socket($NUMBER_OF_INSTANCE_PER_SOCKET)\` cannot be divisible by \`core_number_per_socket($core_number_per_socket)\`"
+if [ $((core_number_per_socket % NUMBER_OF_INSTANCE_PER_SOCKET)) != 0 ]; then
+    echo "\`core_number_per_socket($core_number_per_socket)\` is not divisible by \`instance_number_per_socket($NUMBER_OF_INSTANCE_PER_SOCKET)\`"
     exit
 else
-    cores_per_instance=$(($core_number_per_socket / $NUMBER_OF_INSTANCE_PER_SOCKET))
+    cores_per_instance=$((core_number_per_socket / NUMBER_OF_INSTANCE_PER_SOCKET))
 fi
 
-if [ $DATASET = 'imdb' ]; then
-    max_test_samples=$((25000/$instance_number))
+if [ "$DATASET" = 'imdb' ]; then
+    max_test_samples=$((25000/instance_number))
 else
-    max_test_samples=$((872/$instance_number))
+    max_test_samples=$((872/instance_number))
 fi
 
 OUTPUT_DIR=$OUTPUT_DIR'/'$pre'/'$DATASET
 echo "log directory is $OUTPUT_DIR"
-mkdir -p $OUTPUT_DIR
+mkdir -p "$OUTPUT_DIR"/output_test
 
 for i in $(seq 1 $instance_number)
 do
     export OMP_NUM_THREADS=$cores_per_instance
-    start_index=$(( ($i-1) * $cores_per_instance))
-    end_index=$(( ($i * $cores_per_instance) -1))
-    mem_bind=$(( $start_index / $core_number_per_socket))
+    start_index=$(( (i-1) * cores_per_instance))
+    end_index=$(( (i * cores_per_instance) -1))
+    mem_bind=$(( start_index / core_number_per_socket))
     echo "\`start core index\` is $start_index"
     echo "\`end core index\` is $end_index"
     echo "\`memory bind\` is $mem_bind"
     str="numactl -C $start_index-$end_index -m $mem_bind"
-    echo $str
-    nohup numactl -C $start_index-$end_index -m $mem_bind python ./src/run_pt.py \
-        --model_name_or_path $MODEL_NAME_OR_PATH \
-        --dataset $DATASET \
+    echo "$str"
+    nohup numactl -C $start_index-$end_index -m $mem_bind python ./src/run_infer.py \
+        --model_name_or_path "$MODEL_NAME_OR_PATH" \
+        --dataset "$DATASET" \
         --multi_instance \
-        --output_dir $OUTPUT_DIR/output_test \
+        --output_dir "$OUTPUT_DIR"/output_test \
+        --infer_impl trainer \
        --do_predict \
         --max_seq_len $SEQUENCE_LEN \
-        --instance_index $i \
+        --instance_index "$i" \
         --max_test_samples $max_test_samples \
-        --per_device_eval_batch_size $BATCH_SIZE \
+        --per_device_eval_batch_size "$BATCH_SIZE" \
         --no_cuda \
-        $USE_IPEX \
-        $BF16 \
+        $APPEND \
+        > "$OUTPUT_DIR"/test_"$i".log 2>&1 &
-        > $OUTPUT_DIR/test_$i.log 2>&1 &
 done
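A usage sketch for the renamed multi-instance trainer script (run from profiling-transformers; passing `--dtype_inf bf16` makes the script expand `$APPEND` into `--bf16 --use_ipex` on the Python command line, as shown above — note `$APPEND` must stay unquoted so the two flags word-split):

```bash
cd profiling-transformers
./inference/inf_trainer_multi.sh -d imdb -n 2 -b 8 --dtype_inf bf16
```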
diff --git a/profiling-transformers/inference/cpu_single_instance.sh b/profiling-transformers/inference/inf_trainer_single.sh
similarity index 58%
rename from profiling-transformers/inference/cpu_single_instance.sh
rename to profiling-transformers/inference/inf_trainer_single.sh
index cb61c4f..fd6435f 100755
--- a/profiling-transformers/inference/cpu_single_instance.sh
+++ b/profiling-transformers/inference/inf_trainer_single.sh
@@ -1,4 +1,6 @@
-# Copyright (C) 2022 Intel Corporation
+#!/bin/bash
+
+# Copyright (C) 2022 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,17 +15,14 @@
 # and limitations under the License.
 #
-#
-#
-# export CUDA_VISIBLE_DEVICES="-1"; \
-export LOG_NAME=`date "+%m%d-%H%M"`
-export DATASET="sst2"
-export BATCH_SIZE=8
-export SEQUENCE_LEN=55
-export MODEL_NAME_OR_PATH="${MODEL_NAME_OR_PATH:-bert-large-uncased}"
-export OUTPUT_DIR="${OUTPUT_DIR:-./logs}"
-export USE_IPEX=""
-export BF16=""
+LOG_NAME=$(date "+%m%d-%H%M")
+DATASET="sst2"
+BATCH_SIZE=8
+SEQUENCE_LEN=55
+MODEL_NAME_OR_PATH="${MODEL_NAME_OR_PATH:-bert-large-uncased}"
+OUTPUT_DIR="${OUTPUT_DIR:-./logs}"
+DTYPE_INF="fp32"
+APPEND=""
 
 while [ "$1" != "" ];
 do
@@ -48,64 +47,63 @@ do
             SEQUENCE_LEN="$1"
             echo "sequence_len is : $SEQUENCE_LEN"
             ;;
-        --use_ipex )
-            USE_IPEX="--use_ipex"
-            echo " use ipex"
-            ;;
-        --bf16 )
-            BF16="--bf16"
-            echo "using hf bf16 inference"
+        --dtype_inf )
+            shift
+            DTYPE_INF="$1"
+            echo "dtype_inf is : $DTYPE_INF"
             ;;
         -h | --help )
-            echo "Usage: ./inference/cpu_single_instance.sh [OPTIONS]"
+            echo "Usage: $0 [OPTIONS]"
             echo "OPTION includes:"
             echo "   -l | --log_name - the log name of this round"
-            echo "   -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET"
+            echo "   -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET"
             echo "   -b | --batch_size - batch size per instance"
             echo "   -s | --sequence_len - max sequence length"
-            echo "   --bf16 - whether using hf bf16 inference"
-            echo "   --use_ipex - whether using ipex"
+            echo "   --dtype_inf - data type used for inference"
             echo "   -h | --help - displays this message"
             exit
             ;;
         * )
             echo "Invalid option: $1"
-            echo "Usage: ./inference/cpu_single_instance.sh [OPTIONS]"
+            echo "Usage: $0 [OPTIONS]"
             echo "OPTION includes:"
             echo "   -l | --log_name - the log name of this round"
-            echo "   -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET"
+            echo "   -d | --dataset - [imdb|sst2] whether to use imdb or sst2 DATASET"
             echo "   -b | --batch_size - batch size per instance"
             echo "   -s | --sequence_len - max sequence length"
-            echo "   --bf16 - whether using hf bf16 inference"
-            echo "   --use_ipex - whether using ipex"
+            echo "   --dtype_inf - data type used for inference"
             exit
             ;;
     esac
     shift
 done
 
+if [ "$DTYPE_INF" == "bf16" ]; then
+    APPEND="--bf16 --use_ipex"
+fi
+
 if [ -z "$LOG_NAME" ]; then
-    pre=`date "+%m%d-%H%M"`
+    pre=$(date "+%m%d-%H%M")
 else
     pre=$LOG_NAME
 fi
 
 OUTPUT_DIR=$OUTPUT_DIR'/'$pre'/'$DATASET
-echo $OUTPUT_DIR
+echo "$OUTPUT_DIR"
 
-mkdir -p $OUTPUT_DIR
+mkdir -p "$OUTPUT_DIR"/output_test
 
 export CUDA_VISIBLE_DEVICES="-1"; \
-python ./src/run_pt.py \
-    --model_name_or_path $MODEL_NAME_OR_PATH \
-    --dataset $DATASET \
-    --output_dir $OUTPUT_DIR/output_test \
+python ./src/run_infer.py \
+    --model_name_or_path "$MODEL_NAME_OR_PATH" \
+    --dataset "$DATASET" \
+    --output_dir "$OUTPUT_DIR"/output_test \
+    --infer_impl trainer \
     --do_predict \
-    --max_seq_len $SEQUENCE_LEN \
-    --per_device_eval_batch_size $BATCH_SIZE \
+    --max_seq_len "$SEQUENCE_LEN" \
+    --per_device_eval_batch_size "$BATCH_SIZE" \
     --no_cuda \
-    $USE_IPEX \
-    $BF16 \
-    2>&1 | tee $OUTPUT_DIR/test_$i.log
+    $APPEND \
+    2>&1 | tee "$OUTPUT_DIR"/test.log
diff --git a/profiling-transformers/inference/single_instance.sh b/profiling-transformers/inference/single_instance.sh
deleted file mode 100755
index ea01f64..0000000
--- a/profiling-transformers/inference/single_instance.sh
+++ /dev/null
@@ -1,126 +0,0 @@
-# Copyright (C) 2022 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-
-#
-
-export LOG_NAME=`date "+%m%d-%H%M"`
-export DATASET="sst2"
-export BATCH_SIZE=8
-export SEQUENCE_LEN=55
-export IPEX_BF16=0
-export IPEX_FP32=0
-export INT8=0
-export INT8_BF16=0
-export MODEL_NAME_OR_PATH="${MODEL_NAME_OR_PATH:-bert-large-uncased}"
-export OUTPUT_DIR="${OUTPUT_DIR:-./logs}"
-
-while [ "$1" != "" ];
-do
-    case $1 in
-        -l | --log_name )
-            shift
-            LOG_NAME="$1"
-            echo "log name is $LOG_NAME"
-            ;;
-        -d | --dataset )
-            shift
-            DATASET="$1"
-            echo "dataset is : $DATASET"
-            ;;
-        -b | --batch_size )
-            shift
-            BATCH_SIZE="$1"
-            echo "batch size per instance is : $BATCH_SIZE"
-            ;;
-        -s | --sequence_len )
-            shift
-            SEQUENCE_LEN="$1"
-            echo "sequence_len is : $SEQUENCE_LEN"
-            ;;
-        --ipex_bf16 )
-            IPEX_BF16=1
-            echo "ipex_bf16 is : $IPEX_BF16"
-            ;;
-        --ipex_fp32 )
-            IPEX_FP32=1
-            echo "ipex_fp32 is : $IPEX_FP32"
-            ;;
-        --int8 )
-            INT8=1
-            echo "int8 is : $INT8"
-            ;;
-        --int8_bf16 )
-            INT8_BF16=1
-            echo "int8_bf16 is : $INT8_BF16"
-            ;;
-        -h | --help )
-            echo "Usage: ././inference/single_instance.sh [OPTIONS]"
-            echo "OPTION includes:"
-            echo "   -l | --log_name - the log name of this round"
-            echo "   -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET"
-            echo "   -b | --batch_size - batch size per instance"
-            echo "   -s | --sequence_len - max sequence length"
-            echo "   --ipex_bf16 - wether to use ipex_bf16 precision"
-            echo "   --ipex_fp32 - wether to use ipex_fp32 precision"
-            echo "   --int8 - wether to use int8 precision"
-            echo "   --int8_bf16 - wether to use int8_bf16 precision"
-            echo "   -h | --help - displays this message"
-            exit
-            ;;
-        * )
-            echo "Invalid option: $1"
-            echo "Usage: ./inference/single_instance.sh [OPTIONS]"
-            echo "OPTION includes:"
-            echo "   -l | --log_name - the log name of this round"
-            echo "   -d | --dataset - [imdb|sst2] wether to use imdb or sst2 DATASET"
-            echo "   -b | --batch_size - batch size per instance"
-            echo "   -s | --sequence_len - max sequence length"
-            echo "   --ipex_bf16 - wether to use ipex_bf16 precision"
-            echo "   --ipex_fp32 - wether to use ipex_fp32 precision"
-            echo "   --int8 - wether to use int8 precision"
-            echo "   --int8_bf16 - wether to use int8_bf16 precision"
-            exit
-            ;;
-    esac
-    shift
-done
-
-if [ -z "$LOG_NAME" ]; then
-    pre=`date "+%m%d-%H%M"`
-else
-    pre=$LOG_NAME
-fi
-
-OUTPUT_DIR=$OUTPUT_DIR'/'$pre'/'$DATASET
-echo $OUTPUT_DIR
-
-mkdir -p $OUTPUT_DIR
-
-
-export CUDA_VISIBLE_DEVICES="-1"; \
-python ./src/run_pt_native_inf.py \
-    --model_name_or_path $MODEL_NAME_OR_PATH \
-    --dataset $DATASET \
-    --int8 $INT8 \
-    --int8_bf16 $INT8_BF16 \
-    --ipex_bf16 $IPEX_BF16 \
-    --ipex_fp32 $IPEX_FP32 \
-    --output_dir $OUTPUT_DIR/output_test \
-    --do_predict \
-    --max_seq_len $SEQUENCE_LEN \
-    --per_device_eval_batch_size $BATCH_SIZE \
-    2>&1 | tee $OUTPUT_DIR/test_$i.log
-
-
diff --git a/profiling-transformers/install.sh b/profiling-transformers/install.sh
index 4bb6d1f..2a7e6e5 100755
--- a/profiling-transformers/install.sh
+++ b/profiling-transformers/install.sh
@@ -1,5 +1,6 @@
-#!/usr/bin/bash
-# Copyright (C) 2022 Intel Corporation
+#!/bin/bash
+
+# Copyright (C) 2022 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,7 +15,7 @@
 # and limitations under the License.
 #
-#
-conda install -y pytorch==1.12.1 torchvision torchaudio cpuonly intel-openmp gperftools ninja setuptools tqdm future cmake numpy pyyaml scikit-learn pydot -c pytorch -c intel -c conda-forge
-pip install transformers==4.21.1 datasets==2.3.2 intel_extension_for_pytorch
-bash deploy/install_torch_ccl.sh
\ No newline at end of file
+conda install -y pytorch==1.12.1 torchvision torchaudio cpuonly intel-openmp gperftools ninja setuptools tqdm future cmake numpy pyyaml scikit-learn pydot impi_rt impi-devel -c pytorch -c intel -c conda-forge
+pip install transformers==4.21.1 datasets==2.3.2 intel_extension_for_pytorch==1.12.300
+bash deploy/install_torch_ccl.sh
+bash deploy/install_tpp.sh
diff --git a/profiling-transformers/src/__init__.py b/profiling-transformers/src/__init__.py
index 356ca7b..e69de29 100644
--- a/profiling-transformers/src/__init__.py
+++ b/profiling-transformers/src/__init__.py
@@ -1,16 +0,0 @@
-# Copyright (C) 2022 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-
-#
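After running install.sh, a quick import check can confirm the pinned stack resolved correctly (a sketch; the exact version strings printed depend on the environment):

```bash
python -c "import torch, transformers, datasets, intel_extension_for_pytorch as ipex; \
print(torch.__version__, transformers.__version__, datasets.__version__, ipex.__version__)"
```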
diff --git a/profiling-transformers/src/finetune.py b/profiling-transformers/src/finetune.py
new file mode 100644
index 0000000..fd4b867
--- /dev/null
+++ b/profiling-transformers/src/finetune.py
@@ -0,0 +1,90 @@
+# Copyright (C) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+
+
+from datasets import load_dataset
+from transformers import AutoTokenizer
+
+from utils import Benchmark
+
+
+class DlsaFinetune(object):
+    def __init__(self, **kwargs):
+        self.args = kwargs['args']
+        self.training_args = kwargs['training_args']
+
+        self.max_train, self.max_test = self.args.max_train_samples, self.args.max_test_samples
+        if self.args.smoke_test:
+            self.max_train, self.max_test = 100, 100
+
+        self.bench = Benchmark()
+        self.track = self.bench.track
+
+    def e2e_finetune(self):
+        with self.track('Total Run'):
+            self._load_data()
+            self._preprocess()
+            self._load_model()
+            self._do_finetune()
+            self._do_infer()
+        self.bench.summary()
+
+    def _load_data(self):
+        with self.track('Load Data'):
+            data = load_dataset(self.args.dataset)
+            train_all = data['train']
+            test_split = 'validation' if self.args.dataset == 'sst2' else 'test'
+            len_train = len(train_all)
+            self.train_data = train_all.select(
+                range(len_train - self.max_train, len_train)) if self.max_train else train_all
+            self.test_data = data[test_split].select(range(self.max_test)) if self.max_test else data[test_split]
+            self.text_column = [c for c in self.test_data.column_names if type(self.test_data[c][0]) != int][0]
+
+    def _preprocess(self):
+        with self.track('Pre-process'):
+            with self.track('----Init tokenizer'):
+                self.tokenizer = AutoTokenizer.from_pretrained(
+                    self.args.tokenizer_name if self.args.tokenizer_name else self.args.model_name_or_path
+                )
+
+            max_seq_len = min(self.args.max_seq_len, self.tokenizer.model_max_length)
+
+            with self.track('----Tokenize + Extract Features'):
+                def preprocess(examples):
+                    return self.tokenizer(
+                        examples[self.text_column],
+                        padding='max_length',
+                        truncation=True,
+                        max_length=max_seq_len
+                    )
+
+                kwargs = dict(
+                    function=preprocess,
+                    batched=True,
+                    num_proc=self.args.preprocessing_num_workers,
+                    remove_columns=[self.text_column] + (['idx'] if self.args.dataset == 'sst2' else []),
+                    load_from_cache_file=not self.args.overwrite_cache)
+
+                self.train_data = self.train_data.map(**kwargs) if self.training_args.do_train else None
+                self.test_data = self.test_data.map(**kwargs) if self.training_args.do_predict else None
+
+    def _load_model(self):
+        raise NotImplementedError
+
+    def _do_finetune(self):
+        raise NotImplementedError
+
+    def _do_infer(self):
+        raise NotImplementedError
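The smoke_test flag in the constructor above caps both splits at 100 samples, which is handy for validating a fresh environment before a full run. A hypothetical end-to-end smoke invocation through the run_finetune.py entry point introduced later in this patch (exact TrainingArguments defaults may vary):

```bash
python ./src/run_finetune.py \
    --model_name_or_path bert-large-uncased \
    --dataset sst2 \
    --finetune_impl trainer \
    --smoke_test \
    --do_train --do_predict \
    --output_dir ./logs/smoke
```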
diff --git a/profiling-transformers/src/finetune_ipex.py b/profiling-transformers/src/finetune_ipex.py
new file mode 100644
index 0000000..7ae8a95
--- /dev/null
+++ b/profiling-transformers/src/finetune_ipex.py
@@ -0,0 +1,108 @@
+# Copyright (C) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+
+
+import intel_extension_for_pytorch as ipex
+import numpy as np
+import torch
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import (
+    AutoModelForSequenceClassification,
+    DataCollatorWithPadding
+)
+
+from finetune import DlsaFinetune
+from utils import compute_metrics, PredsLabels
+
+
+class FinetuneIpex(DlsaFinetune):
+    def _load_data(self):
+        return super()._load_data()
+
+    def _preprocess(self):
+        return super()._preprocess()
+
+    def _load_model(self):
+        if self.training_args.do_train:
+            with self.track('Load Model'):
+                if self.args.dtype_ft == "fp32":
+                    self.model = AutoModelForSequenceClassification \
+                        .from_pretrained(self.args.model_name_or_path)
+                    self.model = ipex.optimize(self.model, dtype=torch.float32, level='O1')
+
+                elif self.args.dtype_ft == "bf16":
+                    with torch.cpu.amp.autocast():
+                        self.model = AutoModelForSequenceClassification \
+                            .from_pretrained(self.args.model_name_or_path)
+                        self.model = ipex.optimize(self.model, dtype=torch.bfloat16, level='O0')
+                else:
+                    error_msg = f'Only fp32 and bf16 are supported. Your input datatype is {self.args.dtype_ft}.'
+                    raise ValueError(error_msg)
+
+    def _do_finetune(self):
+        if self.training_args.do_train:
+            with self.track('Fine-Tune'):
+                with self.track('--------Init Fine-Tuning'):
+                    batch_size = self.training_args.per_device_train_batch_size
+                    self.model.train()
+                    weight_decay = 0.0
+                    no_decay = ["bias", "LayerNorm.weight"]
+                    optimizer_grouped_parameters = [
+                        {
+                            "params": [p for n, p in self.model.named_parameters() if
+                                       not any(nd in n for nd in no_decay)],
+                            "weight_decay": weight_decay,
+                        },
+                        {
+                            "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
+                            "weight_decay": 0.0,
+                        },
+                    ]
+                    optim = torch.optim.AdamW(optimizer_grouped_parameters, lr=self.training_args.learning_rate)
+
+                with self.track('--------Training Loop'):
+                    for _ in tqdm(range(int(self.training_args.num_train_epochs)), desc='Epoch'):
+                        for batch in tqdm(DataLoader(self.train_data, batch_size=batch_size, shuffle=True,
+                                                     collate_fn=DataCollatorWithPadding(self.tokenizer)),
+                                          desc='Train Step'):
+                            optim.zero_grad()
+                            loss = self.model(**batch)[0]
+                            loss.backward()
+                            optim.step()
+
+                with self.track('--------Save Fine-Tuned Model'):
+                    torch.save(self.model.state_dict(), self.training_args.output_dir + "/pytorch_model.bin")
+
+    def _do_infer(self):
+        if self.training_args.do_predict:
+            with self.track('Inference'):
+                batch_size = self.training_args.per_device_eval_batch_size
+                all_outputs, all_labels = [], []
+
+                def prediction_step(batch, labels):
+                    all_labels.extend(labels)
+                    inputs = batch
+                    output = self.model(**inputs)
+                    all_outputs.append(output['logits'].detach().cpu())
+
+                self.model.eval()
+                with torch.no_grad():
+                    for batch in tqdm(DataLoader(self.test_data, batch_size=batch_size,
+                                                 collate_fn=DataCollatorWithPadding(self.tokenizer)), desc='Test Step'):
+                        prediction_step(batch=batch, labels=batch.pop('labels'))
+                acc = compute_metrics(PredsLabels(preds=np.concatenate(all_outputs), labels=all_labels))
+                print(f"\n*********** TEST_METRICS ***********\nAccuracy: {acc['acc']}\n")
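A sketch of driving this IPEX fine-tuning path directly, mirroring what the ft_ipex.sh wrapper does (flag names come from this patch; defaults live in the wrapper script):

```bash
python ./src/run_finetune.py \
    --model_name_or_path bert-large-uncased \
    --dataset sst2 \
    --finetune_impl ipex \
    --dtype_ft bf16 \
    --do_train --do_predict \
    --output_dir ./logs/ipex_bf16
```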
diff --git a/profiling-transformers/src/finetune_ipex_dist.py b/profiling-transformers/src/finetune_ipex_dist.py
new file mode 100644
index 0000000..d74a487
--- /dev/null
+++ b/profiling-transformers/src/finetune_ipex_dist.py
@@ -0,0 +1,138 @@
+# Copyright (C) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+
+import os
+
+import numpy as np
+import torch
+from torch.utils.data import DataLoader, RandomSampler
+from torch.utils.data.distributed import DistributedSampler
+from tqdm import tqdm
+from transformers import (
+    set_seed,
+    DataCollatorWithPadding
+)
+
+from finetune_ipex import FinetuneIpex
+from utils import compute_metrics, PredsLabels
+
+
+class FinetuneIpexDist(FinetuneIpex):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        set_seed(self.training_args.seed)
+
+        if int(os.environ.get('PMI_SIZE', '0')) > 1 and not self.args.multi_instance:
+            if self.args.dist_backend == 'ccl':
+                try:
+                    import oneccl_bindings_for_pytorch
+                except ImportError:
+                    print("CCL backend requested but import oneccl_bindings_for_pytorch failed")
+                    raise
+            elif self.args.dist_backend == 'mpi':
+                if not torch.distributed.is_mpi_available():
+                    try:
+                        import torch_mpi
+                    except ImportError:
+                        print("MPI backend requested but not available; try installing the torch_mpi module")
+                        raise
+            else:
+                raise ValueError(f"{self.args.dist_backend} backend requested but not supported")
+
+            os.environ['RANK'] = os.environ.get('PMI_RANK', '0')
+            os.environ['WORLD_SIZE'] = os.environ.get('PMI_SIZE', '1')
+            torch.distributed.init_process_group(backend=self.args.dist_backend)
+            self.training_args.local_rank = torch.distributed.get_rank()
+            if self.training_args.local_rank == 0:
+                print(
+                    f"##################Using {self.args.dist_backend.upper()} dist "
+                    f"run with {torch.distributed.get_world_size()} ranks",
+                    flush=True)
+
+    def _load_data(self):
+        return super()._load_data()
+
+    def _preprocess(self):
+        return super()._preprocess()
+
+    def _load_model(self):
+        return super()._load_model()
+
+    def _do_finetune(self):
+        if self.training_args.do_train:
+            with self.track('Fine-Tune'):
+                with self.track('--------Init Fine-Tuning'):
+                    batch_size = self.training_args.per_device_train_batch_size
+                    self.model.train()
+                    weight_decay = 0.0
+                    no_decay = ["bias", "LayerNorm.weight"]
+                    optimizer_grouped_parameters = [
+                        {
+                            "params": [p for n, p in self.model.named_parameters() if
+                                       not any(nd in n for nd in no_decay)],
+                            "weight_decay": weight_decay,
+                        },
+                        {
+                            "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
+                            "weight_decay": 0.0,
+                        },
+                    ]
+                    optim = torch.optim.AdamW(optimizer_grouped_parameters, lr=self.training_args.learning_rate)
+                    if self.training_args.local_rank != -1:
+                        self.model = torch.nn.parallel.DistributedDataParallel(self.model)
+
+                with self.track('--------Training Loop'):
+                    train_sampler = RandomSampler(
+                        self.train_data) if self.training_args.local_rank == -1 else DistributedSampler(
+                        self.train_data)
+
+                    for _ in tqdm(range(int(self.training_args.num_train_epochs)), desc='Epoch',
+                                  disable=self.training_args.local_rank not in [-1, 0]):
+                        for batch in tqdm(DataLoader(self.train_data, sampler=train_sampler, batch_size=batch_size,
+                                                     collate_fn=DataCollatorWithPadding(self.tokenizer)),
+                                          desc='Train Step', disable=self.training_args.local_rank not in [-1, 0]):
+                            optim.zero_grad()
+                            loss = self.model(**batch)[0]
+                            loss.backward()
+                            optim.step()
+
+                with self.track('--------Save Fine-Tuned Model'):
+                    torch.save(self.model.state_dict(), self.training_args.output_dir + "/pytorch_model.bin")
+
+    def _do_infer(self):
+        if self.training_args.do_predict:
+            with self.track('Inference'):
+                batch_size = self.training_args.per_device_eval_batch_size
+                all_outputs, all_labels = [], []
+
+                def prediction_step(batch, labels):
+                    all_labels.extend(labels)
+                    inputs = batch
+                    output = self.model(**inputs)
+                    all_outputs.append(output['logits'].detach().cpu())
+
+                self.model.eval()
+                with torch.no_grad():
+                    test_sampler = RandomSampler(
+                        self.test_data) if self.training_args.local_rank == -1 else DistributedSampler(
+                        self.test_data)
+
+                    for batch in tqdm(DataLoader(self.test_data, sampler=test_sampler, batch_size=batch_size,
+                                                 collate_fn=DataCollatorWithPadding(self.tokenizer)),
+                                      desc='Test Step'):
+                        prediction_step(batch=batch, labels=batch.pop('labels'))
+                acc = compute_metrics(PredsLabels(preds=np.concatenate(all_outputs), labels=all_labels))
+                print(f"\n*********** TEST_METRICS ***********\nAccuracy: {acc['acc']}\n")
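FinetuneIpexDist keys off the PMI_RANK/PMI_SIZE variables that Intel MPI exports to each rank, so a two-rank launch on one node might look like the sketch below (assumes the oneAPI MPI environment is already sourced; mpiexec.hydra ships with the impi_rt package installed above):

```bash
# with the oneAPI MPI environment loaded, launch 2 ranks on this node
mpiexec.hydra -np 2 -ppn 2 python ./src/run_finetune.py \
    --model_name_or_path bert-large-uncased \
    --dataset sst2 \
    --finetune_impl ipex_ccl \
    --dist_backend ccl \
    --do_train --do_predict \
    --output_dir ./logs/dist
```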
diff --git a/profiling-transformers/src/finetune_tpp.py b/profiling-transformers/src/finetune_tpp.py
new file mode 100644
index 0000000..0916c53
--- /dev/null
+++ b/profiling-transformers/src/finetune_tpp.py
@@ -0,0 +1,109 @@
+# Copyright (C) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+
+
+import numpy as np
+import torch
+from torch.utils.data import DataLoader
+from tpp_pytorch_extension import bert as tpp_bert
+from tqdm import tqdm
+from transformers import (
+    AutoModelForSequenceClassification,
+    DataCollatorWithPadding
+)
+
+from finetune import DlsaFinetune
+from utils import compute_metrics, PredsLabels
+
+
+class FinetuneTpp(DlsaFinetune):
+
+    def _load_data(self):
+        return super()._load_data()
+
+    def _preprocess(self):
+        return super()._preprocess()
+
+    def _load_model(self):
+        if self.training_args.do_train:
+            with self.track('Load Model'):
+                if self.args.dtype_ft == "fp32":
+                    with tpp_bert.tpp_impl(self.args.use_tpp, False, self.args.unpad):
+                        self.model = AutoModelForSequenceClassification.from_pretrained(self.args.model_name_or_path)
+
+                elif self.args.dtype_ft == "bf16":
+                    with tpp_bert.tpp_impl(self.args.use_tpp, True, self.args.unpad):
+                        self.model = AutoModelForSequenceClassification.from_pretrained(self.args.model_name_or_path)
+                else:
+                    error_msg = f'Only fp32 and bf16 are supported. Your input datatype is {self.args.dtype_ft}.'
+                    raise ValueError(error_msg)
+
+                if self.args.use_tpp:
+                    tpp_bert.block(self.model)
+
+    def _do_finetune(self):
+        if self.training_args.do_train:
+            with self.track('Fine-Tune'):
+                with self.track('--------Init Fine-Tuning'):
+                    batch_size = self.training_args.per_device_train_batch_size
+                    self.model.train()
+                    weight_decay = 0.0
+                    no_decay = ["bias", "LayerNorm.weight"]
+                    optimizer_grouped_parameters = [
+                        {
+                            "params": [p for n, p in self.model.named_parameters() if
+                                       not any(nd in n for nd in no_decay)],
+                            "weight_decay": weight_decay,
+                        },
+                        {
+                            "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
+                            "weight_decay": 0.0,
+                        },
+                    ]
+                    optim = tpp_bert.AdamW(optimizer_grouped_parameters, lr=self.training_args.learning_rate)
+
+                with self.track('--------Training Loop'):
+                    for _ in tqdm(range(int(self.training_args.num_train_epochs)), desc='Epoch'):
+                        for batch in tqdm(DataLoader(self.train_data, batch_size=batch_size, shuffle=True,
+                                                     collate_fn=DataCollatorWithPadding(self.tokenizer)),
+                                          desc='Train Step'):
+                            optim.zero_grad()
+                            loss = self.model(**batch)[0]
+                            loss.backward()
+                            optim.step()
+
+                with self.track('--------Save Fine-Tuned Model'):
+                    torch.save(self.model.state_dict(), self.training_args.output_dir + "/pytorch_model.bin")
+
+    def _do_infer(self):
+        if self.training_args.do_predict:
+            with self.track('Inference'):
+                batch_size = self.training_args.per_device_eval_batch_size
+                all_outputs, all_labels = [], []
+
+                def prediction_step(batch, labels):
+                    all_labels.extend(labels)
+                    inputs = batch
+                    output = self.model(**inputs)
+                    all_outputs.append(output['logits'].detach().cpu())
+
+                self.model.eval()
+                with torch.no_grad():
+                    for batch in tqdm(DataLoader(self.test_data, batch_size=batch_size,
+                                                 collate_fn=DataCollatorWithPadding(self.tokenizer)), desc='Test Step'):
+                        prediction_step(batch=batch, labels=batch.pop('labels'))
+                acc = compute_metrics(PredsLabels(preds=np.concatenate(all_outputs), labels=all_labels))
+                print(f"\n*********** TEST_METRICS ***********\nAccuracy: {acc['acc']}\n")
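The TPP path swaps in tpp_pytorch_extension's fused BERT kernels and its AdamW. Launching it is symmetric to the IPEX case; a sketch, assuming --use_tpp and --unpad are the boolean flags this class reads from utils.Arguments:

```bash
python ./src/run_finetune.py \
    --model_name_or_path bert-large-uncased \
    --dataset sst2 \
    --finetune_impl tpp \
    --dtype_ft bf16 \
    --use_tpp --unpad \
    --do_train --do_predict \
    --output_dir ./logs/tpp_bf16
```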
diff --git a/profiling-transformers/src/finetune_tpp_dist.py b/profiling-transformers/src/finetune_tpp_dist.py
new file mode 100644
index 0000000..0238ce8
--- /dev/null
+++ b/profiling-transformers/src/finetune_tpp_dist.py
@@ -0,0 +1,139 @@
+# Copyright (C) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+
+import os
+
+import numpy as np
+import torch
+from torch.utils.data import DataLoader, RandomSampler
+from torch.utils.data.distributed import DistributedSampler
+from tpp_pytorch_extension import bert as tpp_bert
+from tqdm import tqdm
+from transformers import (
+    DataCollatorWithPadding,
+    set_seed
+)
+
+from finetune_tpp import FinetuneTpp
+from utils import compute_metrics, PredsLabels
+
+
+class FinetuneTppDist(FinetuneTpp):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        set_seed(self.training_args.seed)
+
+        if int(os.environ.get('PMI_SIZE', '0')) > 1 and not self.args.multi_instance:
+            if self.args.dist_backend == 'ccl':
+                try:
+                    import oneccl_bindings_for_pytorch
+                except ImportError:
+                    print("CCL backend requested but import oneccl_bindings_for_pytorch failed")
+                    raise
+            elif self.args.dist_backend == 'mpi':
+                if not torch.distributed.is_mpi_available():
+                    try:
+                        import torch_mpi
+                    except ImportError:
+                        print("MPI backend requested but not available; try installing the torch_mpi module")
+                        raise
+            else:
+                raise ValueError(f"{self.args.dist_backend} backend requested but not supported")
+
+            os.environ['RANK'] = os.environ.get('PMI_RANK', '0')
+            os.environ['WORLD_SIZE'] = os.environ.get('PMI_SIZE', '1')
+            torch.distributed.init_process_group(backend=self.args.dist_backend)
+            self.training_args.local_rank = torch.distributed.get_rank()
+            if self.training_args.local_rank == 0:
+                print(
+                    f"##################Using {self.args.dist_backend.upper()} dist "
+                    f"run with {torch.distributed.get_world_size()} ranks",
+                    flush=True)
+
+    def _load_data(self):
+        return super()._load_data()
+
+    def _preprocess(self):
+        return super()._preprocess()
+
+    def _load_model(self):
+        return super()._load_model()
+
+    def _do_finetune(self):
+        if self.training_args.do_train:
+            with self.track('Fine-Tune'):
+                with self.track('--------Init Fine-Tuning'):
+                    batch_size = self.training_args.per_device_train_batch_size
+                    self.model.train()
+                    weight_decay = 0.0
+                    no_decay = ["bias", "LayerNorm.weight"]
+                    optimizer_grouped_parameters = [
+                        {
+                            "params": [p for n, p in self.model.named_parameters() if
+                                       not any(nd in n for nd in no_decay)],
+                            "weight_decay": weight_decay,
+                        },
+                        {
+                            "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
+                            "weight_decay": 0.0,
+                        },
+                    ]
+                    optim = tpp_bert.AdamW(optimizer_grouped_parameters, lr=self.training_args.learning_rate)
+                    if self.training_args.local_rank != -1:
+                        self.model = torch.nn.parallel.DistributedDataParallel(self.model)
+
+                with self.track('--------Training Loop'):
+                    train_sampler = RandomSampler(
+                        self.train_data) if self.training_args.local_rank == -1 else DistributedSampler(
+                        self.train_data)
+
+                    for _ in tqdm(range(int(self.training_args.num_train_epochs)), desc='Epoch',
+                                  disable=self.training_args.local_rank not in [-1, 0]):
+                        for batch in tqdm(DataLoader(self.train_data, sampler=train_sampler, batch_size=batch_size,
+                                                     collate_fn=DataCollatorWithPadding(self.tokenizer)),
+                                          desc='Train Step', disable=self.training_args.local_rank not in [-1, 0]):
+                            optim.zero_grad()
+                            loss = self.model(**batch)[0]
+                            loss.backward()
+                            optim.step()
+
+                with self.track('--------Save Fine-Tuned Model'):
+                    torch.save(self.model.state_dict(), self.training_args.output_dir + "/pytorch_model.bin")
+
+    def _do_infer(self):
+        if self.training_args.do_predict:
+            with self.track('Inference'):
+                batch_size = self.training_args.per_device_eval_batch_size
+                all_outputs, all_labels = [], []
+
+                def prediction_step(batch, labels):
+                    all_labels.extend(labels)
+                    inputs = batch
+                    output = self.model(**inputs)
+                    all_outputs.append(output['logits'].detach().cpu())
+
+                self.model.eval()
+                with torch.no_grad():
+                    test_sampler = RandomSampler(
+                        self.test_data) if self.training_args.local_rank == -1 else DistributedSampler(
+                        self.test_data)
+
+                    for batch in tqdm(DataLoader(self.test_data, sampler=test_sampler, batch_size=batch_size,
+                                                 collate_fn=DataCollatorWithPadding(self.tokenizer)),
+                                      desc='Test Step'):
+                        prediction_step(batch=batch, labels=batch.pop('labels'))
+                acc = compute_metrics(PredsLabels(preds=np.concatenate(all_outputs), labels=all_labels))
+                print(f"\n*********** TEST_METRICS ***********\nAccuracy: {acc['acc']}\n")
diff --git a/profiling-transformers/src/finetune_trainer.py b/profiling-transformers/src/finetune_trainer.py
new file mode 100644
index 0000000..e7f8d71
--- /dev/null
+++ b/profiling-transformers/src/finetune_trainer.py
@@ -0,0 +1,59 @@
+# Copyright (C) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+
+
+from transformers import (
+    AutoModelForSequenceClassification,
+    Trainer
+)
+
+from finetune import DlsaFinetune
+from utils import compute_metrics, save_test_metrics, save_train_metrics
+
+
+class FinetuneTrainer(DlsaFinetune):
+    def _load_data(self):
+        return super()._load_data()
+
+    def _preprocess(self):
+        return super()._preprocess()
+
+    def _load_model(self):
+        with self.track('Load Model'):
+            self.model = AutoModelForSequenceClassification.from_pretrained(self.args.model_name_or_path)
+
+            self.trainer = Trainer(
+                model=self.model,  # the instantiated HF model to be trained
+                args=self.training_args,  # training arguments, defined above
+                train_dataset=self.train_data,  # training dataset
+                compute_metrics=compute_metrics,  # evaluation metrics
+                tokenizer=self.tokenizer
+            )
+
+    def _do_finetune(self):
+        if self.training_args.do_train:
+            with self.track('Fine-Tune'):
+                train_result = self.trainer.train()
+                self.trainer.save_model()
+                save_train_metrics(train_result, self.trainer, len(self.train_data))
+
+    def _do_infer(self):
+        test_metrics = ""
+        if self.training_args.do_predict:
+            with self.track('Inference'):
+                preds, _, metrics = self.trainer.predict(self.test_data)
+                test_metrics = save_test_metrics(metrics, len(self.test_data), self.training_args.output_dir)
+        print(test_metrics)
diff --git a/profiling-transformers/src/infer.py b/profiling-transformers/src/infer.py
new file mode 100644
index 0000000..7da7e43
--- /dev/null
+++ b/profiling-transformers/src/infer.py
@@ -0,0 +1,91 @@
+# Copyright (C) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+
+
+from datasets import load_dataset
+from transformers import AutoTokenizer
+
+from utils import Benchmark
+
+
+class DlsaInference(object):
+    def __init__(self, **kwargs):
+        self.args = kwargs['args']
+        self.training_args = kwargs['training_args']
+
+        self.max_train, self.max_test = self.args.max_train_samples, self.args.max_test_samples
+        if self.args.smoke_test:
+            self.max_train, self.max_test = 100, 100
+
+        self.bench = Benchmark()
+        self.track = self.bench.track
+
+    def e2e_infer(self):
+        with self.track('Total Run'):
+            self._load_data()
+            self._preprocess()
+            self._load_model()
+            self._do_infer()
+        self.bench.summary()
+
+    def _load_data(self):
+        with self.track('Load Data'):
+            data = load_dataset(self.args.dataset)
+            test_split = 'validation' if self.args.dataset == 'sst2' else 'test'
+            if self.args.multi_instance:
+                start_index = (self.args.instance_index - 1) * self.args.max_test_samples
+                end_index = self.args.instance_index * self.args.max_test_samples
+                self.test_data = data[test_split].select(range(start_index, end_index))
+                print("start_index is ", start_index)
+                print("end_index is ", end_index)
+                print("test length is ", len(self.test_data))
+            else:
+                self.test_data = data[test_split].select(range(self.max_test)) if self.max_test else data[test_split]
+
+            self.text_column = [c for c in self.test_data.column_names if type(self.test_data[c][0]) != int][0]
+
+    def _preprocess(self):
+        with self.track('Pre-process'):
+            with self.track('----Init tokenizer'):
+                self.tokenizer = AutoTokenizer.from_pretrained(
+                    self.args.tokenizer_name if self.args.tokenizer_name else self.args.model_name_or_path
+                )
+
+            self.max_seq_len = min(self.args.max_seq_len, self.tokenizer.model_max_length)
+
+            with self.track('----Tokenize + Extract Features'):
+                def preprocess(examples):
+                    return self.tokenizer(
+                        examples[self.text_column],
+                        padding='max_length',
+                        truncation=True,
+                        max_length=self.max_seq_len
+                    )
+
+                kwargs = dict(
+                    function=preprocess,
+                    batched=True,
+                    num_proc=self.args.preprocessing_num_workers,
+                    remove_columns=[self.text_column] + (['idx'] if self.args.dataset == 'sst2' else []),
+                    load_from_cache_file=not self.args.overwrite_cache)
+
+                self.test_data = self.test_data.map(**kwargs) if self.training_args.do_predict else None
+
+    def _load_model(self):
+        raise NotImplementedError
+
+    def _do_infer(self):
+        raise NotImplementedError
diff --git a/profiling-transformers/src/infer_ipex.py b/profiling-transformers/src/infer_ipex.py
new file mode 100644
index 0000000..d00c530
--- /dev/null
+++ b/profiling-transformers/src/infer_ipex.py
@@ -0,0 +1,136 @@
+# Copyright (C) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+
+import os
+
+import intel_extension_for_pytorch as ipex
+import numpy as np
+import torch
+from torch import tensor
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import (
+    AutoModelForSequenceClassification,
+    DataCollatorWithPadding
+)
+
+from infer import DlsaInference
+from utils import compute_metrics, PredsLabels
+
+
+class IpexInfer(DlsaInference):
+    def _load_data(self):
+        return super()._load_data()
+
+    def _preprocess(self):
+        return super()._preprocess()
+
+    def _load_model(self):
+        with self.track('Load Model'):
+            self.model = AutoModelForSequenceClassification.from_pretrained(self.args.model_name_or_path)
+
+        if self.args.dtype_inf == 'fp32':
+            self.model = ipex.optimize(self.model, dtype=torch.float32, level='O1')
+
+        elif self.args.dtype_inf == 'bf16':
+            with self.track("Process bf16 model"):
+                self.model = ipex.optimize(self.model, dtype=torch.bfloat16, level='O0')
+                dummy_tensor = torch.ones((self.training_args.per_device_eval_batch_size, self.max_seq_len),
+                                          dtype=torch.long)
+                jit_inputs = (dummy_tensor, dummy_tensor, dummy_tensor)
+                with torch.cpu.amp.autocast(), torch.no_grad():
+                    self.model = torch.jit.trace(self.model, jit_inputs, strict=False)
+                self.model = torch.jit.freeze(self.model)
+                with torch.no_grad():
+                    y = self.model(dummy_tensor, dummy_tensor, dummy_tensor)
+                    y = self.model(dummy_tensor, dummy_tensor, dummy_tensor)
+
+        elif self.args.dtype_inf == 'int8':
+            with self.track("Process int8 model"):
+                # convert fp32 model to int8
+                dummy_tensor = torch.ones((self.training_args.per_device_eval_batch_size, self.max_seq_len),
+                                          dtype=torch.long)
+                jit_inputs = (dummy_tensor, dummy_tensor, dummy_tensor)
+
+                if os.path.exists(self.args.model_name_or_path + "/quantized_model.pt"):
+                    print("load int8 model-----------------------")
+                    with torch.cpu.amp.autocast():
+                        self.model = torch.jit.load(self.args.model_name_or_path + "/quantized_model.pt")
+                        self.model = torch.jit.freeze(self.model.eval())
+                else:
+                    print("load configure and convert the model")
+                    ipex.nn.utils._model_convert.replace_dropout_with_identity(self.model)
+                    from intel_extension_for_pytorch.quantization import prepare, convert
+                    from torch.ao.quantization import MinMaxObserver, PerChannelMinMaxObserver, QConfig
+                    qconfig = QConfig(
+                        activation=MinMaxObserver.with_args(qscheme=torch.per_tensor_affine, dtype=torch.quint8),
+                        weight=PerChannelMinMaxObserver.with_args(dtype=torch.qint8,
+                                                                  qscheme=torch.per_channel_symmetric))
+                    prepared_model = prepare(self.model, qconfig, example_inputs=jit_inputs, inplace=False)
+                    prepared_model.load_qconf_summary(
+                        qconf_summary=self.args.model_name_or_path + "/int8_configure.json")
+                    with torch.cpu.amp.autocast():
+                        self.model = convert(prepared_model)
+                        self.model = torch.jit.trace(self.model, jit_inputs, strict=False)
+                    self.model = torch.jit.freeze(self.model)
+
+                with torch.no_grad():
+                    y = self.model(dummy_tensor, dummy_tensor, dummy_tensor)
+                    y = self.model(dummy_tensor, dummy_tensor, dummy_tensor)
+
+        else:
+            error_msg = f'Only fp32, bf16 and int8 are supported. Your input datatype is {self.args.dtype_inf}.'
+            raise ValueError(error_msg)
+
+    def _do_infer(self):
+        if self.training_args.do_predict:
+            with self.track('Inference'):
+                batch_size = self.training_args.per_device_eval_batch_size
+                all_outputs, all_labels = [], []
+
+                device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+                def to_inputs(batch: dict) -> dict:
+                    return {k: (v if torch.is_tensor(v) else tensor(v)).to(device=device) for k, v in batch.items()}
+
+                def prediction_step(batch, labels):
+                    all_labels.extend(labels)
+                    inputs = to_inputs(batch)
+                    output = self.model(**inputs)
+                    all_outputs.append(output['logits'].detach().cpu())
+
+                self.model.eval()
+
+                with torch.no_grad():
+                    if self.args.profiler:
+                        with torch.profiler.profile(
+                                schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=2),
+                                on_trace_ready=torch.profiler.tensorboard_trace_handler(
+                                    './profiler/' + self.args.profiler_name),
+                                record_shapes=True,
+                                profile_memory=True,
+                                with_stack=True
+                        ) as prof:
+                            for batch in tqdm(DataLoader(self.test_data, batch_size=batch_size,
+                                                         collate_fn=DataCollatorWithPadding(self.tokenizer))):
+                                prediction_step(batch=batch, labels=batch.pop('labels'))
+                                prof.step()
+                    else:
+                        for batch in tqdm(DataLoader(self.test_data, batch_size=batch_size,
+                                                     collate_fn=DataCollatorWithPadding(self.tokenizer))):
+                            prediction_step(batch=batch, labels=batch.pop('labels'))
+
+                acc = compute_metrics(PredsLabels(preds=np.concatenate(all_outputs), labels=all_labels))
+                print(f"\n*********** TEST_METRICS ***********\nAccuracy: {acc['acc']}\n")
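The int8 branch above expects an already-quantized artifact next to the model weights: either quantized_model.pt or int8_configure.json inside --model_name_or_path. A hypothetical invocation once such files exist (the model directory name is illustrative):

```bash
python ./src/run_infer.py \
    --model_name_or_path ./models/bert-large-sst2-int8 \
    --dataset sst2 \
    --infer_impl ipex \
    --dtype_inf int8 \
    --do_predict \
    --output_dir ./logs/int8
```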
diff --git a/profiling-transformers/src/infer_trainer.py b/profiling-transformers/src/infer_trainer.py
new file mode 100644
index 0000000..aa3607d
--- /dev/null
+++ b/profiling-transformers/src/infer_trainer.py
@@ -0,0 +1,50 @@
+# Copyright (C) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+
+
+from transformers import (
+    AutoModelForSequenceClassification,
+    Trainer
+)
+
+from infer import DlsaInference
+from utils import compute_metrics, save_test_metrics
+
+
+class TrainerInfer(DlsaInference):
+    def _load_data(self):
+        return super()._load_data()
+
+    def _preprocess(self):
+        return super()._preprocess()
+
+    def _load_model(self):
+        with self.track('Load Model'):
+            self.model = AutoModelForSequenceClassification.from_pretrained(self.args.model_name_or_path)
+
+            self.trainer = Trainer(
+                model=self.model,  # the instantiated HF model to be trained
+                args=self.training_args,  # training arguments, defined above
+                compute_metrics=compute_metrics,  # evaluation metrics
+                tokenizer=self.tokenizer
+            )
+
+    def _do_infer(self):
+        test_metrics = ""
+        if self.training_args.do_predict:
+            with self.track('Inference'):
+                preds, _, metrics = self.trainer.predict(self.test_data)
+                test_metrics = save_test_metrics(metrics, len(self.test_data), self.training_args.output_dir)
+        print(test_metrics)
diff --git a/profiling-transformers/src/run_finetune.py b/profiling-transformers/src/run_finetune.py
new file mode 100644
index 0000000..8f0e874
--- /dev/null
+++ b/profiling-transformers/src/run_finetune.py
@@ -0,0 +1,55 @@
+# Copyright (C) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+
+
+from transformers import HfArgumentParser, TrainingArguments
+from transformers import logging as hf_logging
+
+from utils import Arguments
+
+hf_logging.set_verbosity_info()
+
+
+def main():
+    parser = HfArgumentParser((Arguments, TrainingArguments))
+    args, training_args = parser.parse_args_into_dataclasses()
+    kwargs = {'args': args, 'training_args': training_args}
+
+    if args.finetune_impl == 'trainer':
+        from finetune_trainer import FinetuneTrainer
+        finetune = FinetuneTrainer(**kwargs)
+    elif args.finetune_impl == 'ipex':
+        from finetune_ipex import FinetuneIpex
+        finetune = FinetuneIpex(**kwargs)
+    elif args.finetune_impl == 'ipex_ccl':
+        from finetune_ipex_dist import FinetuneIpexDist
+        finetune = FinetuneIpexDist(**kwargs)
+    elif args.finetune_impl == 'tpp':
+        from finetune_tpp import FinetuneTpp
+        finetune = FinetuneTpp(**kwargs)
+    elif args.finetune_impl == 'tpp_ccl':
+        from finetune_tpp_dist import FinetuneTppDist
+        finetune = FinetuneTppDist(**kwargs)
+    else:
+        error_msg = f'Only trainer, ipex, ipex_ccl, tpp and tpp_ccl implementations ' \
+                    f'are supported for the DLSA fine-tuning pipeline. ' \
+                    f'Your input is {args.finetune_impl}.'
+        raise ValueError(error_msg)
+
+    finetune.e2e_finetune()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/profiling-transformers/src/run_infer.py b/profiling-transformers/src/run_infer.py
new file mode 100644
index 0000000..7a7f0b2
--- /dev/null
+++ b/profiling-transformers/src/run_infer.py
@@ -0,0 +1,43 @@
+# Copyright (C) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+
+
+from transformers import HfArgumentParser, TrainingArguments
+from utils import Arguments
+from transformers import logging as hf_logging
+
+hf_logging.set_verbosity_info()
+
+
+def main():
+    parser = HfArgumentParser((Arguments, TrainingArguments))
+    args, training_args = parser.parse_args_into_dataclasses()
+    kwargs = {'args': args, 'training_args': training_args}
+
+    if args.infer_impl == 'trainer':
+        from infer_trainer import TrainerInfer
+        infer = TrainerInfer(**kwargs)
+    elif args.infer_impl == 'ipex':
+        from infer_ipex import IpexInfer
+        infer = IpexInfer(**kwargs)
+    else:
+        error_msg = f'Only trainer and ipex implementations are supported for the DLSA inference pipeline. ' \
+                    f'Your input is {args.infer_impl}.'
+        raise ValueError(error_msg)
+
+    infer.e2e_infer()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/profiling-transformers/src/run_pt.py b/profiling-transformers/src/run_pt.py
deleted file mode 100644
index 7e5010b..0000000
--- a/profiling-transformers/src/run_pt.py
+++ /dev/null
@@ -1,137 +0,0 @@
-# Copyright (C) 2022 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-
-#
-
-import logging
-
-from datasets import load_dataset
-from transformers import (
-    logging as hf_logging,
-    HfArgumentParser,
-    AutoTokenizer,
-    AutoModelForSequenceClassification,
-    Trainer,
-    TrainingArguments,
-)
-
-from utils import (
-    Arguments,
-    Benchmark,
-    compute_metrics,
-    save_train_metrics,
-    save_test_metrics,
-    check_dataset
-)
-
-hf_logging.set_verbosity_info()
-logger = logging.getLogger(__name__)
-
-
-def main():
-    # See all possible arguments in src/transformers/training_args.py
-    # or by passing the --help flag to this script.
-    parser = HfArgumentParser((Arguments, TrainingArguments))
-    args, training_args = parser.parse_args_into_dataclasses()
-
-    max_train, max_test = args.max_train_samples, args.max_test_samples
-    if args.smoke_test:
-        training_args.max_steps = 3
-        max_train, max_test = 10, 10
-
-    bench = Benchmark()
-    track = bench.track
-    with track('Total Run'):
-        ############################ Load Data ####################################
-        with track('Load Data'):
-            data = load_dataset(*check_dataset(args.dataset))
-            train_all = data['train']
-            test_split = 'validation' if args.dataset == 'sst2' else 'test'
-            len_train = len(train_all)
-            train_data = train_all.select(range(len_train - max_train, len_train)) if max_train else train_all
-
-            # split the Test Data for multi-instance
-            if args.multi_instance:
-                start_index = (args.instance_index - 1) * args.max_test_samples
-                end_index = args.instance_index * args.max_test_samples
-                test_data = data[test_split].select(range(start_index, end_index))
-                print("start_index is ", start_index)
-                print("end_index is ", end_index)
-                print("test length is ", len(test_data))
-            else:
-                test_data = data[test_split].select(range(max_test)) if max_test else data[test_split]
-
-            text_column = [c for c in test_data.column_names if type(test_data[c][0]) != int][0]
-
-        ############################### Pre-process ###############################
-        with track('Pre-process'):
-            with track('----Init tokenizer'):
-                tokenizer = AutoTokenizer.from_pretrained(
-                    args.tokenizer_name if args.tokenizer_name else args.model_name_or_path
-                )
-
-            max_seq_len = min(args.max_seq_len, tokenizer.model_max_length)
-
-            with track('----Tokenize + Extract Features'):
-                def preprocess(examples):
-                    return tokenizer(
-                        examples[text_column],
-                        padding='max_length',
-                        truncation=True,
-                        max_length=max_seq_len
-                    )
-
-                kwargs = dict(
-                    function=preprocess,
-                    batched=True,
-                    num_proc=args.preprocessing_num_workers,
-                    remove_columns=[text_column] + (['idx'] if args.dataset == 'sst2' else []),
-                    load_from_cache_file=not args.overwrite_cache)
-
-                train_data = train_data.map(**kwargs) if training_args.do_train else None
-                test_data = test_data.map(**kwargs) if training_args.do_predict else None
-
-        ###################### Load Model and Trainer ############################
-        with track('Load Model'):
-            model = AutoModelForSequenceClassification.from_pretrained(args.model_name_or_path)
-
-            trainer = Trainer(
-                model=model,  # the instantiated HF model to be trained
-                args=training_args,  # training arguments, defined above
-                train_dataset=train_data,  # training dataset
-                compute_metrics=compute_metrics,  # evaluation metrics
-                tokenizer=tokenizer
-            )
-
-        ############################### Fine-Tune #################################
-        if training_args.do_train:
-            with track('Fine-Tune'):
-                train_result = trainer.train()
-                trainer.save_model()
-                save_train_metrics(train_result, trainer, len(train_data))
-
-        ############################### Inference #################################
-        test_metrics = ""
-        if training_args.do_predict:
-            with track('Inference'):
-                preds, _, metrics = trainer.predict(test_data)
-                test_metrics = save_test_metrics(metrics, len(test_data), training_args.output_dir)
-
-    bench.summary()
-    print(test_metrics)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/profiling-transformers/src/run_pt_native.py b/profiling-transformers/src/run_pt_native.py
deleted file mode 100644
index 108abaf..0000000
--- a/profiling-transformers/src/run_pt_native.py
+++ /dev/null
@@ -1,249 +0,0 @@
2022 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions -# and limitations under the License. -# - -# - -from pathlib import Path -import os -import logging -from tqdm import tqdm - -import numpy as np -import torch -from torch.utils.data import DataLoader -from torch import tensor - -try: - import intel_extension_for_pytorch as ipex -finally: - pass - -import transformers -from transformers import ( - HfArgumentParser, - AutoTokenizer, - AutoModelForSequenceClassification, - TrainingArguments, -) - -from utils import ( - Arguments, - read_dataset, - to_tensor_dataset, - Benchmark, - compute_metrics, - PredsLabels -) - -transformers.logging.set_verbosity_info() - -logger = logging.getLogger(__name__) - - -def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - - parser = HfArgumentParser((Arguments, TrainingArguments)) - args, training_args = parser.parse_args_into_dataclasses() - output_dir = Path(training_args.output_dir) - os.makedirs(output_dir, exist_ok=True) - bench = Benchmark() - track = bench.track - - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - - def to_inputs(batch: dict) -> dict: - return {k: (v if torch.is_tensor(v) else tensor(v)).to(device=device) \ - for k, v in batch.items()} - - ################################# Load Data ################################# - - with track('Load Data'): - if training_args.do_train: - # Train Data - train_texts, train_labels = read_dataset(args.dataset, 'train') - max_train = args.max_train_samples if args.max_train_samples else len(train_texts) - if args.smoke_test: - training_args.max_steps = 3 - training_args.num_train_epochs = 1 - max_train = 104 - train_texts, train_labels = train_texts[:max_train], train_labels[:max_train] - - if training_args.do_predict: - max_test = 100 if args.smoke_test else (args.max_test_samples if args.max_test_samples else None) - - if not args.real_time: - # Test Data - test_texts, test_labels = read_dataset(args.dataset, 'test') - if args.multi_instance: - start_index = (args.instance_index - 1) * args.max_test_samples - end_index = args.instance_index * args.max_test_samples - test_texts, test_labels = test_texts[start_index:end_index], test_labels[start_index:end_index] - print("start_index is ", start_index) - print("end_index is ", end_index) - print("test text length is ", len(test_texts)) - print("test labels length is ", len(test_labels)) - else: - test_texts, test_labels = test_texts[:max_test], test_labels[:max_test] - - ################################# Pre-process ################################# - with track('Pre-process'): - with track('----Init tokenizer'): - # Tokenization + Feature Extraction - tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer_name if args.tokenizer_name else args.model_name_or_path - ) - max_seq_len = min(args.max_seq_len, tokenizer.model_max_length) - token_args = dict(truncation=True, padding=True, max_length=max_seq_len) - - if training_args.do_train: - with track('----Training data 
encoding'): - train_encodings = tokenizer(train_texts, **token_args) - with track('----Training tensor data convert'): - train_dataset = to_tensor_dataset('pt', train_encodings, train_labels) - - if training_args.do_predict and not args.real_time: - with track('----PyTorch test data encoding'): - test_encodings = tokenizer(test_texts, padding='max_length', max_length=max_seq_len, - truncation=True) - with track('----PyTorch test tensor data convert'): - test_dataset = to_tensor_dataset('pt', test_encodings, test_labels) - - ################################# Load Model ################################# - if training_args.do_train or not args.torchscript: - with track('Load Model'): - if args.bf16_ipex_ft: - with torch.cpu.amp.autocast(): - model = AutoModelForSequenceClassification \ - .from_pretrained(args.model_name_or_path) \ - .to(device=device) - model = ipex.optimize(model, dtype=torch.bfloat16, level='O0') - else: - model = AutoModelForSequenceClassification \ - .from_pretrained(args.model_name_or_path) \ - .to(device=device) - - if args.fp32_ipex_ft: - model = ipex.optimize(model, dtype=torch.float32, level='O1') - - with track("Process int8 model"): - if args.int8: - # convert fp32 model to int8 - ipex.nn.utils._model_convert.replace_dropout_with_identity(model) - conf = ipex.quantization.QuantConf(configure_file=args.model_name_or_path + "/configure.json") - dumpy_tensor = torch.ones((training_args.per_device_eval_batch_size, max_seq_len), dtype=torch.long) - jit_inputs = (dumpy_tensor, dumpy_tensor, dumpy_tensor) - if args.int8_bf16: - with torch.cpu.amp.autocast(): - model = ipex.quantization.convert(model, conf, jit_inputs) - else: - model = ipex.quantization.convert(model, conf, jit_inputs) - with torch.no_grad(): - y = model(dumpy_tensor, dumpy_tensor, dumpy_tensor) - y = model(dumpy_tensor, dumpy_tensor, dumpy_tensor) - - with track("Process bf16 model"): - if args.ipex_bf16: - # convert fp32 model to bf16 - with torch.cpu.amp.autocast(), torch.no_grad(): - torch.jit.load('imdb_bf16model.pt') - model = ipex.optimize(model, dtype=torch.bfloat16, level='O0') - dumpy_tensor = torch.ones((training_args.per_device_eval_batch_size, max_seq_len), dtype=torch.long) - jit_inputs = (dumpy_tensor, dumpy_tensor, dumpy_tensor) - with torch.cpu.amp.autocast(), torch.no_grad(): - model = torch.jit.trace(model, jit_inputs, strict=False) - model = torch.jit.freeze(model) - with torch.no_grad(): - y = model(dumpy_tensor, dumpy_tensor, dumpy_tensor) - y = model(dumpy_tensor, dumpy_tensor, dumpy_tensor) - - ################################ Fine-Tune ################################# - if training_args.do_train: - with track('Fine-Tune'): - with track('--------Init Fine-Tuning'): - batch_size = training_args.per_device_train_batch_size - model.train() - weight_decay = 0.0 - no_decay = ["bias", "LayerNorm.weight"] - optimizer_grouped_parameters = [ - { - "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], - "weight_decay": weight_decay, - }, - { - "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], - "weight_decay": 0.0, - }, - ] - optim = torch.optim.AdamW(optimizer_grouped_parameters, lr=training_args.learning_rate) - - with track('--------Training Loop'): - for _ in tqdm(range(int(training_args.num_train_epochs)), desc='Epoch'): - for batch in tqdm(DataLoader(train_dataset, batch_size=batch_size, shuffle=True), - desc='Train Step'): - optim.zero_grad() - loss = model(**to_inputs(batch))[0] - loss.backward() - 
optim.step() - - with track('--------Save Fine-Tuned Model'): - if args.torchscript: - with track('--------Save TorchScript model'): - model.eval() - batch = to_inputs(batch) - traced_model = torch.jit.trace(model, [batch['input_ids'], batch['attention_mask']]) - torch.jit.save(traced_model, output_dir / "traced_model.pt") - else: - torch.save(model.state_dict(), output_dir / "pytorch_model.bin") - - ############################### Inference ################################# - if training_args.do_predict: - with track('Inference'): - if args.torchscript: - with track('--------Load TorchScript model'): - model_path = output_dir if training_args.do_train else Path(args.model_name_or_path) - model = torch.jit.load(model_path / "traced_model.pt").to(device=device) - - batch_size = training_args.per_device_eval_batch_size - all_outputs, all_labels = [], [] - - def prediction_step(batch, labels): - all_labels.extend(labels) - inputs = to_inputs(batch) - output = model(inputs['input_ids'], inputs['attention_mask']) if args.torchscript \ - else model(**inputs) - all_outputs.append(output['logits'].detach().cpu()) - - model.eval() - with torch.no_grad(): - if args.real_time: - data_generator = read_dataset(args.dataset, 'test', generator=True, \ - batch_size=batch_size, max_samples=max_test) - - for texts, labels in tqdm(data_generator, desc='Test Step'): - prediction_step(batch=tokenizer(texts, **token_args), labels=labels) - - else: - for batch in tqdm(DataLoader(test_dataset, batch_size=batch_size), desc='Test Step'): - prediction_step(batch=batch, labels=batch.pop('labels')) - acc = compute_metrics(PredsLabels(preds=np.concatenate(all_outputs), labels=all_labels)) - print(f"\n*********** TEST_METRICS ***********\nAccuracy: {acc['acc']}\n") - - bench.summary() - - -if __name__ == "__main__": - main() diff --git a/profiling-transformers/src/run_pt_native_ft.py b/profiling-transformers/src/run_pt_native_ft.py deleted file mode 100644 index 1040ced..0000000 --- a/profiling-transformers/src/run_pt_native_ft.py +++ /dev/null @@ -1,286 +0,0 @@ -# Copyright (C) 2022 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions -# and limitations under the License. -# - -# - -from pathlib import Path -import os -import logging -from tqdm import tqdm - -import numpy as np -import torch -from torch.utils.data import DataLoader, RandomSampler -from torch.utils.data.distributed import DistributedSampler -from torch import tensor - -try: - import intel_extension_for_pytorch as ipex -finally: - pass - -import transformers -from transformers import ( - HfArgumentParser, - AutoTokenizer, - AutoModelForSequenceClassification, - TrainingArguments, - set_seed, -) - -from utils import ( - Arguments, - read_dataset, - to_tensor_dataset, - Benchmark, - compute_metrics, - PredsLabels -) - -transformers.logging.set_verbosity_info() - -logger = logging.getLogger(__name__) - - -def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. 
- - parser = HfArgumentParser((Arguments, TrainingArguments)) - args, training_args = parser.parse_args_into_dataclasses() - output_dir = Path(training_args.output_dir) - os.makedirs(output_dir, exist_ok=True) - bench = Benchmark() - track = bench.track - - set_seed(training_args.seed) - - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - - if int(os.environ.get('PMI_SIZE', '0')) > 1 and not args.multi_instance: - if args.dist_backend == 'ccl': - try: - import oneccl_bindings_for_pytorch - except: - print("CCL backend requested but import oneccl_bindings_for_pytorch failed") - raise - elif args.dist_backend == 'mpi': - if not torch.distributed.is_mpi_available(): - try: - import torch_mpi - except: - print("MPI backend requested but not available try installing torch_mpi module") - raise - else: - raise ValueError(f"{args.dist_backend} backend requested but not supported") - - os.environ['RANK'] = os.environ.get('PMI_RANK', '0') - os.environ['WORLD_SIZE'] = os.environ.get('PMI_SIZE', '1') - torch.distributed.init_process_group(backend=args.dist_backend) - device = torch.device("cpu") - training_args.local_rank = torch.distributed.get_rank() - if training_args.local_rank == 0: print(f"##################Using {args.dist_backend.upper()} dist run with {torch.distributed.get_world_size()} ranks", flush=True) - - def to_inputs(batch: dict) -> dict: - return {k: (v if torch.is_tensor(v) else tensor(v)).to(device=device) \ - for k, v in batch.items()} - - ################################# Load Data ################################# - - with track('Load Data'): - if training_args.do_train: - # Train Data - train_texts, train_labels = read_dataset(args.dataset, 'train') - max_train = args.max_train_samples if args.max_train_samples else len(train_texts) - if args.smoke_test: - training_args.max_steps = 3 - training_args.num_train_epochs = 1 - max_train = 104 - train_texts, train_labels = train_texts[:max_train], train_labels[:max_train] - - if training_args.do_predict: - max_test = 100 if args.smoke_test else (args.max_test_samples if args.max_test_samples else None) - - if not args.real_time: - # Test Data - test_texts, test_labels = read_dataset(args.dataset, 'test') - if args.multi_instance: - start_index = (args.instance_index - 1) * args.max_test_samples - end_index = args.instance_index * args.max_test_samples - test_texts, test_labels = test_texts[start_index:end_index], test_labels[start_index:end_index] - print("start_index is ", start_index) - print("end_index is ", end_index) - print("test text length is ", len(test_texts)) - print("test labels length is ", len(test_labels)) - else: - test_texts, test_labels = test_texts[:max_test], test_labels[:max_test] - - ################################# Pre-process ################################# - with track('Pre-process'): - with track('----Init tokenizer'): - # Tokenization + Feature Extraction - tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer_name if args.tokenizer_name else args.model_name_or_path - ) - max_seq_len = min(args.max_seq_len, tokenizer.model_max_length) - token_args = dict(truncation=True, padding=True, max_length=max_seq_len) - - if training_args.do_train: - with track('----Training data encoding'): - train_encodings = tokenizer(train_texts, **token_args) - with track('----Training tensor data convert'): - train_dataset = to_tensor_dataset('pt', train_encodings, train_labels) - - if training_args.do_predict and not args.real_time: - with track('----PyTorch test data encoding'): - test_encodings 
= tokenizer(test_texts, padding='max_length', max_length=max_seq_len, - truncation=True) - with track('----PyTorch test tensor data convert'): - test_dataset = to_tensor_dataset('pt', test_encodings, test_labels) - - ################################# Load Model ################################# - if training_args.do_train or not args.torchscript: - with track('Load Model'): - if args.bf16_ipex_ft: - with torch.cpu.amp.autocast(): - model = AutoModelForSequenceClassification \ - .from_pretrained(args.model_name_or_path) \ - .to(device=device) - model = ipex.optimize(model, dtype=torch.bfloat16, level='O0') - else: - model = AutoModelForSequenceClassification \ - .from_pretrained(args.model_name_or_path) \ - .to(device=device) - #model = AutoModelForSequenceClassification \ - # .from_pretrained(args.model_name_or_path) \ - # .to(device=device) - - with track("Process int8 model"): - if args.int8: - # convert fp32 model to int8 - ipex.nn.utils._model_convert.replace_dropout_with_identity(model) - conf = ipex.quantization.QuantConf(configure_file=args.model_name_or_path + "/configure.json") - dumpy_tensor = torch.ones((training_args.per_device_eval_batch_size, max_seq_len), dtype=torch.long) - jit_inputs = (dumpy_tensor, dumpy_tensor, dumpy_tensor) - if args.int8_bf16: - with torch.cpu.amp.autocast(): - model = ipex.quantization.convert(model, conf, jit_inputs) - else: - model = ipex.quantization.convert(model, conf, jit_inputs) - with torch.no_grad(): - y = model(dumpy_tensor, dumpy_tensor, dumpy_tensor) - y = model(dumpy_tensor, dumpy_tensor, dumpy_tensor) - - with track("Process bf16 model"): - if args.ipex_bf16: - # convert fp32 model to bf16 - with torch.cpu.amp.autocast(), torch.no_grad(): - torch.jit.load('imdb_bf16model.pt') - model = ipex.optimize(model, dtype=torch.bfloat16, level='O0') - dumpy_tensor = torch.ones((training_args.per_device_eval_batch_size, max_seq_len), dtype=torch.long) - jit_inputs = (dumpy_tensor, dumpy_tensor, dumpy_tensor) - with torch.cpu.amp.autocast(), torch.no_grad(): - model = torch.jit.trace(model, jit_inputs, strict=False) - model = torch.jit.freeze(model) - with torch.no_grad(): - y = model(dumpy_tensor, dumpy_tensor, dumpy_tensor) - y = model(dumpy_tensor, dumpy_tensor, dumpy_tensor) - - ################################ Fine-Tune ################################# - if training_args.do_train: - with track('Fine-Tune'): - with track('--------Init Fine-Tuning'): - batch_size = training_args.per_device_train_batch_size - model.train() - weight_decay = 0.0 - no_decay = ["bias", "LayerNorm.weight"] - optimizer_grouped_parameters = [ - { - "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], - "weight_decay": weight_decay, - }, - { - "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], - "weight_decay": 0.0, - }, - ] - optim = torch.optim.AdamW(optimizer_grouped_parameters, lr=training_args.learning_rate) - if training_args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model) - - with track('--------Training Loop'): - train_sampler = RandomSampler(train_dataset) if training_args.local_rank == -1 else DistributedSampler(train_dataset) - - for _ in tqdm(range(int(training_args.num_train_epochs)), desc='Epoch', disable=training_args.local_rank not in [-1, 0]): - for batch in tqdm(DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size), - desc='Train Step', disable=training_args.local_rank not in [-1, 0]): - optim.zero_grad() - loss = 
model(**to_inputs(batch))[0] - loss.backward() - optim.step() - - with track('--------Save Fine-Tuned Model'): - if training_args.local_rank in [-1, 0]: - # Take care of DDP wrapper - model_to_save = model.module if hasattr(model, "module") else model - if args.torchscript: - with track('--------Save TorchScript model'): - model.eval() - batch = to_inputs(batch) - traced_model = torch.jit.trace(model_to_save, [batch['input_ids'], batch['attention_mask']]) - torch.jit.save(traced_model, output_dir / "traced_model.pt") - else: - torch.save(model_to_save.state_dict(), output_dir / "pytorch_model.bin") - - ############################### Inference ################################# - if training_args.do_predict: - with track('Inference'): - if args.torchscript: - with track('--------Load TorchScript model'): - model_path = output_dir if training_args.do_train else Path(args.model_name_or_path) - model = torch.jit.load(model_path / "traced_model.pt").to(device=device) - - batch_size = training_args.per_device_eval_batch_size - all_outputs, all_labels = [], [] - - def prediction_step(batch, labels): - all_labels.extend(labels) - inputs = to_inputs(batch) - output = model(inputs['input_ids'], inputs['attention_mask']) if args.torchscript \ - else model(**inputs) - all_outputs.append(output['logits'].detach().cpu()) - - model.eval() - with torch.no_grad(): - if args.real_time: - data_generator = read_dataset(args.dataset, 'test', generator=True, \ - batch_size=batch_size, max_samples=max_test) - - for texts, labels in tqdm(data_generator, desc='Test Step'): - prediction_step(batch=tokenizer(texts, **token_args), labels=labels) - - else: - test_sampler = RandomSampler(test_dataset) if training_args.local_rank == -1 else DistributedSampler(test_dataset) - - for batch in tqdm(DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size), desc='Test Step'): - prediction_step(batch=batch, labels=batch.pop('labels')) - acc = compute_metrics(PredsLabels(preds=np.concatenate(all_outputs), labels=all_labels)) - print(f"\n*********** TEST_METRICS ***********\nAccuracy: {acc['acc']}\n") - - bench.summary() - - -if __name__ == "__main__": - main() diff --git a/profiling-transformers/src/run_pt_native_inf.py b/profiling-transformers/src/run_pt_native_inf.py deleted file mode 100644 index 935906f..0000000 --- a/profiling-transformers/src/run_pt_native_inf.py +++ /dev/null @@ -1,223 +0,0 @@ -# Copyright (C) 2022 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions -# and limitations under the License. 
-# - -# - -import logging -import os - -import numpy as np -import torch -from datasets import load_dataset -from torch import tensor -from torch.utils.data import DataLoader -from tqdm import tqdm - -try: - import intel_extension_for_pytorch as ipex -finally: - pass - -from transformers import ( - logging as hf_logging, - HfArgumentParser, - AutoTokenizer, - AutoModelForSequenceClassification, - TrainingArguments, - DataCollatorWithPadding -) - -from utils import ( - Arguments, - Benchmark, - compute_metrics, - PredsLabels, - check_dataset -) - -hf_logging.set_verbosity_info() -logger = logging.getLogger(__name__) - - -def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - parser = HfArgumentParser((Arguments, TrainingArguments)) - args, training_args = parser.parse_args_into_dataclasses() - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - - max_train, max_test = args.max_train_samples, args.max_test_samples - if args.smoke_test: - training_args.max_steps = 3 - max_train, max_test = 10, 10 - - bench = Benchmark() - track = bench.track - with track('Total Run'): - ############################ Load Data #################################### - with track('Load Data'): - data = load_dataset(*check_dataset(args.dataset)) - train_all = data['train'] - test_split = 'validation' if args.dataset == 'sst2' else 'test' - len_train = len(train_all) - train_data = train_all.select(range(len_train - max_train, len_train)) if max_train else train_all - - # split the Test Data for multi-instance - if args.multi_instance: - start_index = (args.instance_index - 1) * args.max_test_samples - end_index = args.instance_index * args.max_test_samples - test_data = data[test_split].select(range(start_index, end_index)) - print("start_index is ", start_index) - print("end_index is ", end_index) - print("test length is ", len(test_data)) - else: - test_data = data[test_split].select(range(max_test)) if max_test else data[test_split] - - text_column = [c for c in test_data.column_names if type(test_data[c][0]) != int][0] - - ############################### Pre-process ############################### - with track('Pre-process'): - with track('----Init tokenizer'): - tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer_name if args.tokenizer_name else args.model_name_or_path - ) - - max_seq_len = min(args.max_seq_len, tokenizer.model_max_length) - - with track('----Tokenize + Extract Features'): - def preprocess(examples): - return tokenizer( - examples[text_column], - padding='max_length', - truncation=True, - max_length=max_seq_len - ) - - kwargs = dict( - function=preprocess, - batched=True, - num_proc=args.preprocessing_num_workers, - remove_columns=[text_column] + (['idx'] if args.dataset == 'sst2' else []), - load_from_cache_file=not args.overwrite_cache) - - train_data = train_data.map(**kwargs) if training_args.do_train else None - test_data = test_data.map(**kwargs) if training_args.do_predict else None - - ###################### Load Model and Trainer ############################ - with track('Load Model'): - model = AutoModelForSequenceClassification.from_pretrained(args.model_name_or_path).to(device=device) - - with track("Process int8 model"): - if args.int8: - # convert fp32 model to int 8 - dumpy_tensor = torch.ones((training_args.per_device_eval_batch_size, max_seq_len), dtype=torch.long) - jit_inputs = (dumpy_tensor, dumpy_tensor, dumpy_tensor) - - if os.path.exists(args.model_name_or_path + 
"/quantized_model.pt"): - print("load int8 model-----------------------") - if args.int8_bf16: - with torch.cpu.amp.autocast(): - model = torch.jit.load(args.model_name_or_path + "/quantized_model.pt") - model = torch.jit.freeze(model.eval()) - else: - model = torch.jit.load(args.model_name_or_path + "/quantized_model.pt") - model = torch.jit.freeze(model.eval()) - else: - print("load configure and convert the model") - ipex.nn.utils._model_convert.replace_dropout_with_identity(model) - from intel_extension_for_pytorch.quantization import prepare, convert - from torch.ao.quantization import MinMaxObserver, PerChannelMinMaxObserver, QConfig - qconfig = QConfig(activation=MinMaxObserver.with_args(qscheme=torch.per_tensor_affine, dtype=torch.quint8), weight=PerChannelMinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_channel_symmetric)) - prepared_model = prepare(model, qconfig, example_inputs=jit_inputs, inplace=False) - prepared_model.load_qconf_summary(qconf_summary = args.model_name_or_path + "/int8_configure.json") - if args.int8_bf16: - with torch.cpu.amp.autocast(): - model = convert(prepared_model) - model = torch.jit.trace(model, jit_inputs, strict=False) - else: - model = convert(prepared_model) - model = torch.jit.trace(model, jit_inputs, strict=False) - model = torch.jit.freeze(model) - - - with torch.no_grad(): - y = model(dumpy_tensor, dumpy_tensor, dumpy_tensor) - y = model(dumpy_tensor, dumpy_tensor, dumpy_tensor) - - # model.save("quantized_model.pt") - # import sys - # sys.exit(0) - - with track("Process bf16 model"): - if args.ipex_bf16: - model = ipex.optimize(model, dtype=torch.bfloat16, level='O0') - dumpy_tensor = torch.ones((training_args.per_device_eval_batch_size, max_seq_len), dtype=torch.long) - jit_inputs = (dumpy_tensor, dumpy_tensor, dumpy_tensor) - with torch.cpu.amp.autocast(), torch.no_grad(): - model = torch.jit.trace(model, jit_inputs, strict=False) - model = torch.jit.freeze(model) - with torch.no_grad(): - y = model(dumpy_tensor, dumpy_tensor, dumpy_tensor) - y = model(dumpy_tensor, dumpy_tensor, dumpy_tensor) - - if args.ipex_fp32: - model = ipex.optimize(model, dtype=torch.float32, level='O1') - - ############################### Inference ################################# - if training_args.do_predict: - with track('Inference'): - batch_size = training_args.per_device_eval_batch_size - all_outputs, all_labels = [], [] - - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - - def to_inputs(batch: dict) -> dict: - return {k: (v if torch.is_tensor(v) else tensor(v)).to(device=device) for k, v in batch.items()} - - def prediction_step(batch, labels): - all_labels.extend(labels) - inputs = to_inputs(batch) - output = model(inputs['input_ids'], inputs['attention_mask']) if args.torchscript \ - else model(**inputs) - all_outputs.append(output['logits'].detach().cpu()) - - model.eval() - - with torch.no_grad(): - if args.profiler: - with torch.profiler.profile( - schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=2), - on_trace_ready=torch.profiler.tensorboard_trace_handler('./profiler/' + args.profiler_name), - record_shapes=True, - profile_memory=True, - with_stack=True - ) as prof: - for batch in tqdm(DataLoader(test_data, batch_size=batch_size, - collate_fn=DataCollatorWithPadding(tokenizer))): - prediction_step(batch=batch, labels=batch.pop('labels')) - prof.step() - else: - for batch in tqdm(DataLoader(test_data, batch_size=batch_size, - collate_fn=DataCollatorWithPadding(tokenizer))): - 
prediction_step(batch=batch, labels=batch.pop('labels'))
-
-            acc = compute_metrics(PredsLabels(preds=np.concatenate(all_outputs), labels=all_labels))
-            print(f"\n*********** TEST_METRICS ***********\nAccuracy: {acc['acc']}\n")
-
-    bench.summary()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/profiling-transformers/src/utils.py b/profiling-transformers/src/utils.py
index 85ff24c..35b1c8b 100644
--- a/profiling-transformers/src/utils.py
+++ b/profiling-transformers/src/utils.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2022 Intel Corporation
+# Copyright (C) 2022 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,28 +13,17 @@
 # and limitations under the License.
 #
 
-#
-
 import json
+from contextlib import contextmanager
 from dataclasses import dataclass, field
-from typing import Optional
 from pathlib import Path
-import numpy as np
 from time import perf_counter_ns
-from dataclasses import dataclass, field
+from typing import Optional
+
 import numpy as np
-from contextlib import contextmanager
-import os
 
 SEC_TO_NS_SCALE = 1000000000
 
-SPLIT_PATHS = {
-    ('imdb', 'train'): './datasets/aclImdb/train',
-    ('imdb', 'test'): './datasets/aclImdb/test',
-    ('sst2', 'train'): './datasets/sst/train.tsv',
-    ('sst2', 'test'): './datasets/sst/dev.tsv'
-}
-
 
 @dataclass
 class Benchmark:
@@ -109,7 +98,7 @@ class Arguments:
     profiler: int = field(
         default=0,
         metadata={
-            "help": "wether using pytorch profiler"
+            "help": "whether using pytorch profiler"
         },
     )
     profiler_name: str = field(
@@ -118,87 +107,60 @@ class Arguments:
         default='test',
         metadata={
            "help": "log name for pytorch profiler"
        },
    )
-    ipex: bool = field(
-        default=False,
-        metadata={
-            "help": "Use Intel® Extension for PyTorch for fine-Tuning."
-        },
-    )
-    ipex_bf16: int = field(
-        default=0,
-        metadata={
-            "help": "Auto mixed precision using bfloat16."
-        },
-    )
-    ipex_fp32: int = field(
-        default=0,
-        metadata={
-            "help": "Auto mixed precision using bfloat16."
-        },
+    preprocessing_num_workers: Optional[int] = field(
+        default=None,
+        metadata={"help": "The number of processes to use for the preprocessing."},
     )
-    bf16_ipex_ft: int = field(
-        default=False,
-        metadata={
-            "help": "Auto mixed precision using bfloat16 to fine-tuning."
-        },
+    overwrite_cache: bool = field(
+        default=True, metadata={"help": "Overwrite the cached training and evaluation sets."}
     )
-    fp32_ipex_ft: int = field(
+    use_tpp: bool = field(
         default=False,
         metadata={
-            "help": "use ipex optimization for fp32 fine-tuning."
-        },
-    )
-    int8_bf16: int = field(
-        default=0,
-        metadata={
-            "help": "Auto mixed precision using int8+bfloat16."
+            "help": "Use TPP Extension for PyTorch for fine-tuning."
         },
     )
-    multi_instance: bool = field(
+    unpad: bool = field(
         default=False,
         metadata={
-            "help": "Whether to use multi-instance mode"
-        },
-    )
-    int8: int = field(
-        default=0,
-        metadata={
-            "help": "Whether to do inference with int8 model"
+            "help": "Enable the unpad optimization when fine-tuning with TPP."
         },
     )
-    dist_backend: Optional[str] = field(
-        default="ccl", metadata={"help": "Distributed backend to use"}
+    #
+    # Arguments for test scenarios
+    #
+    infer_impl: Optional[str] = field(
+        default="trainer", metadata={
+            "help": "The implementation of the inference pipeline. Currently, the trainer and ipex implementations are supported."
+        }
     )
-    preprocessing_num_workers: Optional[int] = field(
-        default=None,
-        metadata={"help": "The number of processes to use for the preprocessing."},
+
+    finetune_impl: Optional[str] = field(
+        default="trainer", metadata={
+            "help": "The implementation of the fine-tuning pipeline. Currently, the trainer and ipex implementations are supported."
+        }
     )
-    overwrite_cache: bool = field(
-        default=True, metadata={"help": "Overwrite the cached training and evaluation sets."}
+
+    dtype_inf: Optional[str] = field(
+        default="fp32", metadata={
+            "help": "Data type for the inference pipeline. "
+                    "Supports fp32, bf16, and int8 on CPU, and fp32 and fp16 on GPU."
+        }
     )
-    real_time: bool = field(
-        default=False, metadata={"help": "Whether to pre-process the inputs in real-time."}
+    dtype_ft: Optional[str] = field(
+        default="fp32", metadata={
+            "help": "Data type for the fine-tuning pipeline. "
+                    "Supports fp32 and bf16 on CPU, and fp32, tf32, and fp16 on GPU."
+        }
     )
-    few_shot: bool = field(
+    multi_instance: bool = field(
         default=False,
         metadata={
-            "help": "Employ few-shot pattern-based MLM training on a small subset of the data."
+            "help": "Whether to use multi-instance mode"
         },
     )
-    pattern_id: bool = field(
-        default=0, metadata={"help": "Few-shot: pattern id of the pattern to use for few-shot training."}
-    )
-    label_loss: bool = field(
-        default=True, metadata={"help": "Few-shot: whether to use label loss."}
-    )
-    random_mlm: bool = field(
-        default=False, metadata={"help": "Few-shot: whether to use random MLM loss."}
-    )
-    alpha: float = field(
-        default=0.6, metadata={"help": "Few-shot: alpha value for loss computation: ."}
-    )
-    torchscript: bool = field(
-        default=False, metadata={"help": "Enable Torchscript."}
+    dist_backend: Optional[str] = field(
+        default="ccl", metadata={"help": "Distributed backend to use for fine-tuning"}
     )
@@ -213,100 +175,6 @@ def compute_metrics(p):
     return {"acc": (preds == p.label_ids).mean()}
 
 
-def check_dataset(name: str):
-    if name == 'imdb':
-        return [name]
-    elif name == 'sst2':
-        return ['glue', 'sst2']
-    else:
-        error_msg = f'Now only imdb and sst2 dataset are supported. Your dataset is {name}.'
- raise ValueError(error_msg) - - -def read_dataset(name: str, split: str = "test", generator: bool = False, - return_labels: bool = True, batch_size: int = 1, max_samples: int = None): - split_path = SPLIT_PATHS[(name, split)] - args = split_path, return_labels, batch_size, max_samples - gen = imdb_gen(*args) if name == 'imdb' else sst_gen(*args) - - if generator: - return gen - - texts, labels = [], [] - for text_batch, label_batch in gen: - texts.extend(text_batch) - if return_labels: - labels.extend(label_batch) - return (texts, labels) if return_labels else texts - - -def imdb_gen(split_path, return_label, batch_size, max_samples): - text_batch, label_batch = [], [] - for label_dir in "pos", "neg": - for i, text_file in enumerate((Path(split_path) / label_dir).iterdir()): - text_batch.append(text_file.read_text()) - if return_label: - label_batch.append(0 if label_dir == 'neg' else 1) - if len(text_batch) == batch_size: - yield (text_batch, label_batch) if return_label else text_batch - text_batch, label_batch = [], [] - if max_samples is not None and i == max_samples / 2: - break - if text_batch: - yield (text_batch, label_batch) if return_label else text_batch - text_batch, label_batch = [], [] - - -def sst_gen(split_path, return_label, batch_size, max_samples): - text_batch, label_batch = [], [] - i = 0 - with open(split_path) as f: - for line in f.readlines()[1:]: - if line: - i += 1 - text, label = line.strip().split(" \t") - text_batch.append(text) - if return_label: - label_batch.append(int(label)) - if len(text_batch) == batch_size: - yield (text_batch, label_batch) if return_label else text_batch - text_batch, label_batch = [], [] - if max_samples is not None and i == max_samples: - break - if text_batch: - yield (text_batch, label_batch) if return_label else text_batch - text_batch, label_batch = [], [] - - -def to_tensor_dataset(framework, encodings, labels=None): - if framework == 'tf': - from tensorflow.data import Dataset - - data = (dict(encodings), labels) if labels else dict(encodings) - dataset = Dataset.from_tensor_slices(data) - - if framework == 'pt': - from torch import tensor - from torch.utils.data import Dataset - - class IMDbDataset(Dataset): - def __init__(self, encodings, labels): - self.encodings = encodings - self.labels = labels - - def __getitem__(self, idx): - item = {key: tensor(val[idx]) for key, val in self.encodings.items()} - item['labels'] = tensor(self.labels[idx]) - return item - - def __len__(self): - return len(self.labels) - - dataset = IMDbDataset(encodings, labels) - - return dataset - - def save_train_metrics(train_result, trainer, max_train): # pytorch only if train_result: @@ -321,23 +189,3 @@ def save_test_metrics(metrics, max_test, output_dir): with open(Path(output_dir) / 'test_results.json', 'w') as f: json.dump(metrics, f, indent=2) return "\n\n******** TEST METRICS ********\n" + '\n'.join(f'{k}: {v}' for k, v in metrics.items()) - - -def read_imdb_split(split_dir): - texts, labels = [], [] - for label_dir in "pos", "neg": - for text_file in (Path(split_dir) / label_dir).iterdir(): - texts.append(text_file.read_text()) - labels.append(0 if label_dir == 'neg' else 1) - return texts, labels - - -def read_sst_file(sst_file): - texts, labels = [], [] - with open(sst_file) as f: - for line in f.readlines()[1:]: - if line: - text, label = line.strip().split(" \t") - texts.append(text) - labels.append(int(label)) - return texts, labels
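---

A minimal usage sketch of the reworked `Arguments` dataclass (illustrative only, not part of the patch; the entry-script name is a placeholder, since this hunk does not show the new file's path):

```python
# HfArgumentParser maps each dataclass field to a CLI flag of the same name,
# so the new pipeline options parse as, e.g.:
#
#   python <entry_script>.py --infer_impl ipex --dtype_inf bf16 \
#       --model_name_or_path bert-base-uncased --output_dir /tmp/dlsa
#
from transformers import HfArgumentParser, TrainingArguments

from utils import Arguments

parser = HfArgumentParser((Arguments, TrainingArguments))
args, training_args = parser.parse_args_into_dataclasses()

# The new fields select the pipeline implementation and precision.
assert args.infer_impl in ('trainer', 'ipex')
assert args.dtype_inf in ('fp32', 'bf16', 'int8', 'fp16')
```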