[shardformer] sync tests modification to sequence parallel branch #4434

Closed
Changes from all commits (67 commits)
cc3cbe9
[workflow] show test duration (#4159)
FrankLeeeee Jul 4, 2023
190a6ea
[dtensor] fixed readme file name and removed deprecated file (#4162)
FrankLeeeee Jul 4, 2023
fee32a3
[docker] added ssh and rdma support for docker (#4192)
FrankLeeeee Jul 7, 2023
5891344
[checkpointio] Unsharded Optimizer Checkpoint for Gemini …
Fridge003 Jul 7, 2023
c1cf752
[docker] fixed ninja build command (#4203)
FrankLeeeee Jul 10, 2023
4e9b09c
Automated submodule synchronization (#4217)
github-actions[bot] Jul 12, 2023
9a4842c
revise shardformer readme (#4246)
CjhHa1 Jul 17, 2023
7ff11b5
[example] add llama pretraining (#4257)
binmakeswell Jul 17, 2023
4b97754
[Kernels] added triton-implemented of self attention for colossal-ai …
tiandiao123 Jul 18, 2023
fc5cef2
[lazy] support init on cuda (#4269)
ver217 Jul 19, 2023
c6f6005
[checkpointio] Sharded Optimizer Checkpoint for Gemini Plugin (#4302)
Fridge003 Jul 21, 2023
02192a6
[ci] support testmon core pkg change detection (#4305)
ver217 Jul 21, 2023
b366f1d
[NFC] Fix format for mixed precision (#4253)
CjhHa1 Jul 17, 2023
86cf6ae
Fix/format (#4261)
MichelleMa8 Jul 18, 2023
915ed8b
[NFC] polish applications/Chat/inference/requirements.txt code style …
Camille7777 Jul 18, 2023
77c469e
[NFC] polish applications/Chat/coati/models/base/actor.py code style …
jason524w Jul 18, 2023
dee1c96
[NFC] policy applications/Chat/examples/ray/mmmt_prompt.py code style…
CZYCW Jul 18, 2023
85774f0
[NFC] polish colossalai/cli/benchmark/utils.py code style (#4254)
yuanheng-zhao Jul 18, 2023
c614a99
[NFC] polish colossalai/auto_parallel/offload/amp_optimizer.py code s…
Yanjia0 Jul 18, 2023
abe4f97
[NFC] polish colossalai/booster/plugin/low_level_zero_plugin.py code …
supercooledith Jul 18, 2023
b2debdc
[NFC] polish applications/Chat/coati/dataset/sft_dataset.py code styl…
zhengzangw Jul 18, 2023
798cb72
[NFC] polish applications/Chat/coati/trainer/base.py code style (#4260)
Shenggan Jul 18, 2023
3883db4
[NFC] polish unary_elementwise_generator.py code style (#4267)
YeAnbang Jul 18, 2023
fee5532
[NFC] polish runtime_preparation_pass style (#4266)
CWHer Jul 18, 2023
a50d39a
[NFC] fix: format (#4270)
dayellow Jul 18, 2023
1ce997d
[NFC] polish applications/Chat/examples/train_reward_model.py code st…
Xu-Kai Jul 18, 2023
caa4433
[NFC] fix format of application/Chat/coati/trainer/utils.py (#4273)
kurisusnowdeng Jul 18, 2023
dc1b612
[NFC] polish applications/Chat/inference/server.py code style (#4274)
chengeharrison Jul 18, 2023
709e121
[NFC] polish applications/Chat/coati/models/generation.py code style …
yangluo7 Jul 18, 2023
c972d65
applications/Chat/.gitignore (#4279)
henryqin1997 Jul 19, 2023
9e51293
[NFC] polish applications/Chat/coati/trainer/strategies/base.py code …
ziruizhu Jul 19, 2023
0991405
[NFC] polish applications/Chat/coati/models/utils.py codestyle (#4277)
yuxuan-lou Jul 19, 2023
ef4b99e
add llama example CI
binmakeswell Jul 22, 2023
5187c96
support session-based training (#4313)
chengeharrison Jul 28, 2023
c6ab969
[zero] refactor low level zero for shard evenly (#4030)
Gy-Lu Jun 30, 2023
79cf1b5
[zero]support no_sync method for zero1 plugin (#4138)
Gy-Lu Jul 4, 2023
c668801
[zero] allow passing process group to zero12 (#4153)
Gy-Lu Jul 4, 2023
dd7cc58
[zero] add state dict for low level zero (#4179)
Gy-Lu Jul 6, 2023
1a49a5e
[zero] support shard optimizer state dict of zero (#4194)
Gy-Lu Jul 11, 2023
45b08f0
[zero] optimize the optimizer step time (#4221)
Gy-Lu Jul 18, 2023
03654c0
fix localhost measurement (#4320)
Gy-Lu Aug 1, 2023
75c5389
[chat] fix compute_approx_kl (#4338)
CWHer Aug 1, 2023
8064771
[release] update version (#4332)
ver217 Aug 1, 2023
16c0acc
[hotfix] update gradio 3.11 to 3.34.0 (#4329)
chncaption Aug 1, 2023
16bf4c0
[test] remove useless tests (#4359)
ver217 Aug 1, 2023
da4f7b8
[chat] fix bugs and add unit tests (#4213)
CWHer Aug 2, 2023
25c57b9
[fix] coloattention support flash attention 2 (#4347)
flybird11111 Aug 4, 2023
38b792a
[coloattention] fix import error (#4380)
flybird11111 Aug 4, 2023
f40b718
[doc] Fix gradient accumulation doc. (#4349)
flybird11111 Aug 4, 2023
089c365
[doc] add Series A Funding and NeurIPS news (#4377)
binmakeswell Aug 4, 2023
7c84f51
[Shardformer] Merge flash attention branch to pipeline branch (#4362)
flybird11111 Aug 7, 2023
2e77e57
[pipeline] rewrite t5 tests & support multi-tensor transmitting in pi…
Fridge003 Aug 8, 2023
458ae33
[kernel] updated unittests for coloattention (#4389)
flybird11111 Aug 9, 2023
c14920a
[shardformer] update shardformer to use flash attention 2 (#4392)
flybird11111 Aug 9, 2023
ed2c229
[shardformer] test all optimizations (#4399)
flybird11111 Aug 10, 2023
6ccecc0
[gemini] fix tensor storage cleaning in state dict collection (#4396)
Fridge003 Aug 10, 2023
9916a19
[pipeline] rewrite bert tests and fix some bugs (#4409)
CjhHa1 Aug 11, 2023
fcbf80f
[shardformer]fix, test gpt2 for AMP+TP (#4403)
flybird11111 Aug 11, 2023
d86ddd9
[hotfix] fix unsafe async comm in zero (#4404)
Gy-Lu Aug 11, 2023
1e518ae
[shardformer] rewrite tests for opt/bloom/llama/vit/chatglm (#4395)
Fridge003 Aug 11, 2023
d4a3a10
[shardformer] update tests for all optimization (#4413)
flybird11111 Aug 11, 2023
6990477
Merge branch 'main' into feature/pipeline
ver217 Aug 14, 2023
ac8d4ed
[shardformer]update t5 tests for using all optimizations. (#4407)
flybird11111 Aug 14, 2023
82ea190
[shardformer] update bloom/llama/vit/chatglm tests (#4420)
flybird11111 Aug 14, 2023
60db2cc
Merge pull request #4424 from ver217/sync/pipeline
FrankLeeeee Aug 14, 2023
9d1a6d2
[misc] resolve code factor issues (#4433)
ver217 Aug 14, 2023
2dd1b39
[sync] update tests modification to sequence parallel branch
flybird11111 Aug 14, 2023
2 changes: 1 addition & 1 deletion .github/workflows/build_on_pr.yml
@@ -208,7 +208,7 @@ jobs:

- name: Execute Unit Testing
run: |
CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest --testmon --testmon-cov=. tests/
CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest --testmon --testmon-cov=. --durations=10 tests/
env:
DATA: /data/scratch/cifar-10
NCCL_SHM_DISABLE: 1
4 changes: 2 additions & 2 deletions .github/workflows/build_on_schedule.yml
@@ -3,7 +3,7 @@ name: Build on Schedule
on:
schedule:
# run at 00:00 of every Sunday
- cron: '0 0 * * *'
- cron: "0 0 * * *"
workflow_dispatch:

jobs:
@@ -60,7 +60,7 @@ jobs:
- name: Unit Testing
if: steps.check-avai.outputs.avai == 'true'
run: |
PYTHONPATH=$PWD pytest tests
PYTHONPATH=$PWD pytest --durations=0 tests
env:
DATA: /data/scratch/cifar-10
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
2 changes: 1 addition & 1 deletion .github/workflows/compatiblity_test_on_dispatch.yml
@@ -72,7 +72,7 @@ jobs:
ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
- name: Download cub for CUDA 10.2
run: |
CUDA_VERSION=$(cat $CUDA_HOME/version.txt | grep "CUDA Version" | awk '{print $NF}' | cut -d. -f1,2)
CUDA_VERSION=$(nvcc -V | awk -F ',| ' '/release/{print $6}')

# check if it is CUDA 10.2
# download cub
2 changes: 1 addition & 1 deletion .github/workflows/compatiblity_test_on_pr.yml
@@ -66,7 +66,7 @@ jobs:
ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
- name: Download cub for CUDA 10.2
run: |
CUDA_VERSION=$(cat $CUDA_HOME/version.txt | grep "CUDA Version" | awk '{print $NF}' | cut -d. -f1,2)
CUDA_VERSION=$(nvcc -V | awk -F ',| ' '/release/{print $6}')

# check if it is CUDA 10.2
# download cub
12 changes: 12 additions & 0 deletions .github/workflows/compatiblity_test_on_schedule.yml
@@ -61,6 +61,18 @@ jobs:
with:
ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}

- name: Download cub for CUDA 10.2
run: |
CUDA_VERSION=$(nvcc -V | awk -F ',| ' '/release/{print $6}')

# check if it is CUDA 10.2
# download cub
if [ "$CUDA_VERSION" = "10.2" ]; then
wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip
unzip 1.8.0.zip
cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/
fi

- name: Install Colossal-AI
run: |
pip install -v --no-cache-dir .
12 changes: 12 additions & 0 deletions .github/workflows/cuda_ext_check_before_merge.yml
@@ -37,6 +37,18 @@ jobs:
- name: Install PyTorch
run: eval ${{ matrix.build.torch_command }}

- name: Download cub for CUDA 10.2
run: |
CUDA_VERSION=$(nvcc -V | awk -F ',| ' '/release/{print $6}')

# check if it is CUDA 10.2
# download cub
if [ "$CUDA_VERSION" = "10.2" ]; then
wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip
unzip 1.8.0.zip
cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/
fi

- name: Build
run: |
CUDA_EXT=1 pip install -v .
4 changes: 3 additions & 1 deletion .github/workflows/run_chatgpt_examples.yml
@@ -43,7 +43,9 @@ jobs:
run: |
cd applications/Chat
rm -rf ~/.cache/colossalai
./examples/test_ci.sh
./tests/test_inference.sh
./tests/test_benchmarks.sh
./tests/test_train.sh
env:
NCCL_SHM_DISABLE: 1
MAX_JOBS: 8
16 changes: 14 additions & 2 deletions README.md
@@ -25,14 +25,15 @@
</div>

## Latest News
* [2023/07] [HPC-AI Tech Raises 22 Million USD in Series A Funding](https://www.hpc-ai.tech/blog/hpc-ai-tech-raises-22-million-usd-in-series-a-funding-to-fuel-team-expansion-and-business-growth)
* [2023/07] [65B Model Pretraining Accelerated by 38%, Best Practices for Building LLaMA-Like Base Models Open-Source](https://www.hpc-ai.tech/blog/large-model-pretraining)
* [2023/03] [ColossalChat: An Open-Source Solution for Cloning ChatGPT With a Complete RLHF Pipeline](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b)
* [2023/03] [Intel and Colossal-AI Partner to Deliver Cost-Efficient Open-Source Solution for Protein Folding Structure Prediction](https://www.hpc-ai.tech/blog/intel-habana)
* [2023/03] [AWS and Google Fund Colossal-AI with Startup Cloud Programs](https://www.hpc-ai.tech/blog/aws-and-google-fund-colossal-ai-with-startup-cloud-programs)
* [2023/02] [Open Source Solution Replicates ChatGPT Training Process! Ready to go with only 1.6GB GPU Memory](https://www.hpc-ai.tech/blog/colossal-ai-chatgpt)
* [2023/01] [Hardware Savings Up to 46 Times for AIGC and Automatic Parallelism](https://medium.com/pytorch/latest-colossal-ai-boasts-novel-automatic-parallelism-and-offers-savings-up-to-46x-for-stable-1453b48f3f02)
* [2022/11] [Diffusion Pretraining and Hardware Fine-Tuning Can Be Almost 7X Cheaper](https://www.hpc-ai.tech/blog/diffusion-pretraining-and-hardware-fine-tuning-can-be-almost-7x-cheaper)
* [2022/10] [Use a Laptop to Analyze 90% of Proteins, With a Single-GPU Inference Sequence Exceeding 10,000](https://www.hpc-ai.tech/blog/use-a-laptop-to-analyze-90-of-proteins-with-a-single-gpu-inference-sequence-exceeding)
* [2022/09] [HPC-AI Tech Completes $6 Million Seed and Angel Round Fundraising](https://www.hpc-ai.tech/blog/hpc-ai-tech-completes-6-million-seed-and-angel-round-fundraising-led-by-bluerun-ventures-in-the)

## Table of Contents
<ul>
@@ -49,6 +50,7 @@
<li>
<a href="#Parallel-Training-Demo">Parallel Training Demo</a>
<ul>
<li><a href="#LLaMA">LLaMA</a></li>
<li><a href="#GPT-3">GPT-3</a></li>
<li><a href="#GPT-2">GPT-2</a></li>
<li><a href="#BERT">BERT</a></li>
@@ -216,6 +218,15 @@ Acceleration of [AlphaFold Protein Structure](https://alphafold.ebi.ac.uk/)

## Parallel Training Demo

### LLaMA
<p align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/examples/images/LLaMA_pretraining.png" width=600/>
</p>

- 65-billion-parameter large model pretraining accelerated by 38%
[[code]](https://github.com/hpcaitech/ColossalAI/tree/example/llama/examples/language/llama)
[[blog]](https://www.hpc-ai.tech/blog/large-model-pretraining)

### GPT-3
<p align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/GPT3-v5.png" width=700/>
@@ -452,6 +463,7 @@ To cite this project, you can use the following BibTeX citation.
}
```

Colossal-AI has been accepted as official tutorial by top conferences [SC](https://sc22.supercomputing.org/), [AAAI](https://aaai.org/Conferences/AAAI-23/), [PPoPP](https://ppopp23.sigplan.org/), [CVPR](https://cvpr2023.thecvf.com/), [ISC](https://www.isc-hpc.com/), etc.
Colossal-AI has been accepted as official tutorial by top conferences [NeurIPS](https://nips.cc/), [SC](https://sc22.supercomputing.org/), [AAAI](https://aaai.org/Conferences/AAAI-23/),
[PPoPP](https://ppopp23.sigplan.org/), [CVPR](https://cvpr2023.thecvf.com/), [ISC](https://www.isc-hpc.com/), [NVIDIA GTC](https://www.nvidia.com/en-us/on-demand/session/gtcspring23-S51482/) ,etc.

<p align="right">(<a href="#top">back to top</a>)</p>
2 changes: 1 addition & 1 deletion applications/Chat/.gitignore
@@ -145,4 +145,4 @@ docs/.build
# wandb log
example/wandb/

examples/awesome-chatgpt-prompts/
examples/awesome-chatgpt-prompts/
7 changes: 4 additions & 3 deletions applications/Chat/coati/dataset/__init__.py
@@ -1,9 +1,10 @@
from .prompt_dataset import PromptDataset
from .reward_dataset import HhRlhfDataset, RmStaticDataset
from .sft_dataset import DataCollatorForSupervisedDataset, SFTDataset, SupervisedDataset
from .sft_dataset import SFTDataset, SupervisedDataset
from .utils import is_rank_0

__all__ = [
'RmStaticDataset', 'HhRlhfDataset', 'is_rank_0', 'SFTDataset', 'SupervisedDataset',
'DataCollatorForSupervisedDataset', 'PromptDataset'
'RmStaticDataset', 'HhRlhfDataset',
'SFTDataset', 'SupervisedDataset',
'PromptDataset', 'is_rank_0',
]
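
Not part of the diff, but worth noting for downstream users: `DataCollatorForSupervisedDataset` is no longer re-exported from `coati.dataset`, so imports that relied on it must be updated. A minimal sketch of the public surface after this change (illustrative only, assuming `coati` is on the Python path):

```python
# These names remain importable from the package root after this change;
# DataCollatorForSupervisedDataset would now have to be imported from its
# defining module instead (if it is still needed at all).
from coati.dataset import (
    HhRlhfDataset,
    PromptDataset,
    RmStaticDataset,
    SFTDataset,
    SupervisedDataset,
    is_rank_0,
)
```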
87 changes: 87 additions & 0 deletions applications/Chat/coati/dataset/conversation.py
@@ -0,0 +1,87 @@
# Copyright 2023 lm-sys@FastChat
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import dataclasses
from enum import Enum, auto
from typing import List


class SeparatorStyle(Enum):
ADD_EOS_TOKEN = auto()


@dataclasses.dataclass
class Conversation:
system: str
roles: List[str]
messages: List[List[str]]
offset: int
sep_style: SeparatorStyle = SeparatorStyle.ADD_EOS_TOKEN
sep: str = "</s>"

skip_next: bool = False

def get_prompt(self):
if self.sep_style == SeparatorStyle.ADD_EOS_TOKEN:
ret = self.system
for role, message in self.messages:
if message:
ret += role + ": " + message + self.sep
else:
ret += role + ": "
return ret
else:
raise ValueError(f"Invalid style: {self.sep_style}")

def append_message(self, role, message):
self.messages.append([role, message])

def to_gradio_chatbot(self):
ret = []
for i, (role, msg) in enumerate(self.messages[self.offset:]):
if i % 2 == 0:
ret.append([msg, None])
else:
ret[-1][-1] = msg
return ret

def copy(self):
return Conversation(system=self.system,
roles=self.roles,
messages=[[x, y] for x, y in self.messages],
offset=self.offset,
sep_style=self.sep_style,
sep=self.sep)

def dict(self):
return {
"system": self.system,
"roles": self.roles,
"messages": self.messages,
"offset": self.offset,
"sep": self.sep
}


conv = Conversation(
system="A chat between a curious human and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
roles=("Human", "Assistant"),
messages=(),
offset=0,
sep_style=SeparatorStyle.ADD_EOS_TOKEN,
sep="</s>",
)

default_conversation = conv
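
For orientation (not part of the diff): a minimal sketch of how the `Conversation` helper added above could be used to build a generation prompt. The question text and import path are illustrative, assuming `coati` is importable:

```python
from coati.dataset.conversation import default_conversation

# Work on a copy so the shared module-level default is not mutated;
# copy() also turns the empty messages tuple into a mutable list.
conv = default_conversation.copy()
conv.append_message(conv.roles[0], "What is Colossal-AI?")
conv.append_message(conv.roles[1], None)  # leave the assistant turn open

# get_prompt() closes completed turns with the "</s>" separator and ends
# with "Assistant: ", ready to be fed to the model for generation.
prompt = conv.get_prompt()
print(prompt)
```

Copying before appending matters because `default_conversation.messages` is initialized as a tuple, which does not support `append`.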
18 changes: 6 additions & 12 deletions applications/Chat/coati/dataset/prompt_dataset.py
@@ -1,20 +1,13 @@
import copy
import random
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Callable, Dict, Sequence
from typing import Dict

import torch
import torch.distributed as dist
import transformers
from torch.utils.data import Dataset
from tqdm import tqdm

from colossalai.logging import get_dist_logger

from .utils import is_rank_0, jload

logger = get_dist_logger()
from .utils import jload


class PromptDataset(Dataset):
@@ -27,12 +20,13 @@ def __init__(self,
max_length: int = 96):
super(PromptDataset, self).__init__()
self.keyed_prompt = defaultdict(list)
logger.info("Loading data...")
self.logger = get_dist_logger()
self.logger.info("Loading data...")
list_data_dict = jload(data_path)
logger.info(f"Loaded {len(list_data_dict)} examples.")
self.logger.info(f"Loaded {len(list_data_dict)} examples.")

if max_datasets_size is not None:
logger.info(f"Limiting dataset to {max_datasets_size} examples.")
self.logger.info(f"Limiting dataset to {max_datasets_size} examples.")
list_data_dict = list_data_dict[:max_datasets_size]

instructions = [data_dict["instruction"] for data_dict in list_data_dict]