Sync/main #5424

Merged
39 commits merged on Mar 4, 2024

Commits (39)
abd8e77
[extension] fixed exception catch (#5342)
FrankLeeeee Jan 31, 2024
c523984
[Chat] fix sft loss nan (#5345)
YeAnbang Feb 1, 2024
ffffc32
[checkpointio] fix gemini and hybrid parallel optim checkpoint (#5347)
ver217 Feb 1, 2024
1c790c0
[fix] remove unnecessary dp_size assert (#5351)
CWHer Feb 2, 2024
2dd01e3
[gemini] fix param op hook when output is tuple (#5355)
ver217 Feb 4, 2024
6c0fa7b
[llama] fix dataloader for hybrid parallel (#5358)
ver217 Feb 5, 2024
73f9f23
[llama] update training script (#5360)
ver217 Feb 5, 2024
a4cec17
[llama] add flash attn patch for npu (#5362)
ver217 Feb 5, 2024
44ca61a
[llama] fix neftune & pbar with start_step (#5364)
Camille7777 Feb 5, 2024
a5756a8
[eval] update llama npu eval (#5366)
Camille7777 Feb 6, 2024
eb4f2d9
[llama] polish training script and fix optim ckpt (#5368)
ver217 Feb 6, 2024
c53ddda
[lr-scheduler] fix load state dict and add test (#5369)
ver217 Feb 6, 2024
084c912
[llama] fix memory issue (#5371)
ver217 Feb 6, 2024
7d8e033
[moe] init mixtral impl
oahzxl Dec 14, 2023
c904d2a
[moe] update capacity computing (#5253)
ver217 Jan 11, 2024
da39d21
[moe] support mixtral (#5309)
ver217 Jan 25, 2024
b60be18
[moe] fix mixtral checkpoint io (#5314)
ver217 Jan 27, 2024
956b561
[moe] fix mixtral forward default value (#5329)
ver217 Jan 30, 2024
65e5d6b
[moe] fix mixtral optim checkpoint (#5344)
ver217 Feb 1, 2024
06db94f
[moe] fix tests
ver217 Feb 8, 2024
4c03347
Merge pull request #5377 from hpcaitech/example/llama-npu
FrankLeeeee Feb 8, 2024
efef43b
Merge pull request #5372 from hpcaitech/exp/mixtral
FrankLeeeee Feb 8, 2024
adae123
[release] update version (#5380)
ver217 Feb 8, 2024
7303801
[llama] fix training and inference scripts (#5384)
ver217 Feb 19, 2024
69e3ad0
[doc] Fix typo (#5361)
yixiaoer Feb 19, 2024
705a62a
[doc] updated installation command (#5389)
FrankLeeeee Feb 19, 2024
b833153
[hotfix] fix variable type for top_p (#5313)
CZYCW Feb 19, 2024
5d380a1
[hotfix] Fix wrong import in meta_registry (#5392)
stephankoe Feb 20, 2024
95c21e3
[extension] hotfix jit extension setup (#5402)
ver217 Feb 26, 2024
d882d18
[example] reuse flash attn patch (#5400)
ver217 Feb 27, 2024
bf34c6f
[fsdp] impl save/load shard model/optimizer (#5357)
ericxsun Feb 27, 2024
dcdd8a5
[setup] fixed nightly release (#5388)
FrankLeeeee Feb 27, 2024
0a25e16
[shardformer]gather llama logits (#5398)
flybird11111 Feb 27, 2024
a28c971
update requirements (#5407)
TongLi3701 Feb 28, 2024
2461f37
[workflow] added pypi channel (#5412)
FrankLeeeee Feb 29, 2024
5de940d
[doc] fix blog link
binmakeswell Feb 29, 2024
a1c6cdb
[doc] fix blog link
binmakeswell Feb 29, 2024
4b8312c
fix sft single turn inference example (#5416)
Camille7777 Mar 1, 2024
0310b76
Merge branch 'main' into sync/main
FrankLeeeee Mar 4, 2024
12 changes: 8 additions & 4 deletions .github/workflows/release_nightly_on_schedule.yml
@@ -6,19 +6,23 @@ on:
- cron: '0 0 * * 6' # release on every Sunday 00:00 UTC time

jobs:
build-n-publish:
publish:
if: github.repository == 'hpcaitech/ColossalAI'
name: Build and publish Python 🐍 distributions 📦 to PyPI
runs-on: ubuntu-latest
timeout-minutes: 20
outputs:
status: ${{ steps.publish.outcome }}
steps:
- uses: actions/checkout@v2

- uses: actions/setup-python@v2
with:
python-version: '3.8.14'

- run: NIGHTLY=1 python setup.py sdist build
- run: |
python .github/workflows/scripts/update_setup_for_nightly.py
python setup.py sdist build

# publish to PyPI if executed on the main branch
- name: Publish package to PyPI
@@ -31,7 +35,7 @@ jobs:

notify:
name: Notify Lark via webhook
needs: build-n-publish
needs: publish
runs-on: ubuntu-latest
if: ${{ always() }} && github.repository == 'hpcaitech/ColossalAI'
steps:
@@ -62,4 +66,4 @@ jobs:
REPO: ${{ github.repository }}
RUN_ID: ${{ github.run_id }}
WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }}
STATUS: ${{ steps.publish.outcome }}
STATUS: ${{ needs.publish.outputs.status }}
2 changes: 1 addition & 1 deletion .github/workflows/release_test_pypi_before_merge.yml
@@ -49,6 +49,6 @@ jobs:
# we need to install the requirements.txt first
# as test-pypi may not contain the distributions for libs listed in the txt file
pip install -r requirements/requirements.txt
pip install --index-url https://test.pypi.org/simple/ colossalai==$VERSION
pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.python.org/pypi colossalai==$VERSION
env:
VERSION: ${{ steps.prep-version.outputs.version }}
34 changes: 34 additions & 0 deletions .github/workflows/scripts/update_setup_for_nightly.py
@@ -0,0 +1,34 @@
from datetime import datetime


def open_setup_file():
with open("setup.py", "r") as f:
file_lines = f.readlines()
return file_lines


def replace_nightly_package_info(file_lines):
version = datetime.today().strftime("%Y.%m.%d")
package_name = "colossalai-nightly"

for idx, line in enumerate(file_lines):
if "version = get_version()" in line:
file_lines[idx] = f'version = "{version}"\n'
if 'package_name = "colossalai"' in line:
file_lines[idx] = f'package_name = "{package_name}"\n'
return file_lines


def write_setup_file(file_lines):
with open("setup.py", "w") as f:
f.writelines(file_lines)


def main():
file_lines = open_setup_file()
file_lines = replace_nightly_package_info(file_lines)
write_setup_file(file_lines)


if __name__ == "__main__":
main()
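
For reference, a minimal sketch (not part of the PR) of the line rewriting this script performs on setup.py during a nightly build; the sample lines below only mirror the two patterns the script searches for:

```python
from datetime import datetime

# Stand-in for the two setup.py lines the script targets (hypothetical content).
setup_lines = [
    "version = get_version()\n",
    'package_name = "colossalai"\n',
]

version = datetime.today().strftime("%Y.%m.%d")
for idx, line in enumerate(setup_lines):
    if "version = get_version()" in line:
        setup_lines[idx] = f'version = "{version}"\n'  # pin the version to today's date
    if 'package_name = "colossalai"' in line:
        setup_lines[idx] = 'package_name = "colossalai-nightly"\n'  # publish under the nightly name

print("".join(setup_lines))
# version = "2024.03.04"          <- date of the nightly run
# package_name = "colossalai-nightly"
```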
10 changes: 5 additions & 5 deletions README.md
@@ -9,7 +9,7 @@
<a href="https://www.colossalai.org/"> Documentation </a> |
<a href="https://github.com/hpcaitech/ColossalAI/tree/main/examples"> Examples </a> |
<a href="https://github.com/hpcaitech/ColossalAI/discussions"> Forum </a> |
<a href="https://medium.com/@hpcaitech"> Blog </a></h3>
<a href="https://hpc-ai.com/blog"> Blog </a></h3>

[![GitHub Repo stars](https://img.shields.io/github/stars/hpcaitech/ColossalAI?style=social)](https://github.com/hpcaitech/ColossalAI/stargazers)
[![Build](https://github.com/hpcaitech/ColossalAI/actions/workflows/build_on_schedule.yml/badge.svg)](https://github.com/hpcaitech/ColossalAI/actions/workflows/build_on_schedule.yml)
@@ -398,10 +398,10 @@ pip install colossalai

**Note: only Linux is supported for now.**

However, if you want to build the PyTorch extensions during installation, you can set `CUDA_EXT=1`.
However, if you want to build the PyTorch extensions during installation, you can set `BUILD_EXT=1`.

```bash
CUDA_EXT=1 pip install colossalai
BUILD_EXT=1 pip install colossalai
```

**Otherwise, CUDA kernels will be built during runtime when you actually need them.**
@@ -429,7 +429,7 @@ By default, we do not compile CUDA/C++ kernels. ColossalAI will build them durin
If you want to install and enable CUDA kernel fusion (compulsory installation when using fused optimizer):

```shell
CUDA_EXT=1 pip install .
BUILD_EXT=1 pip install .
```

For Users with CUDA 10.2, you can still build ColossalAI from source. However, you need to manually download the cub library and copy it to the corresponding directory.
@@ -445,7 +445,7 @@ unzip 1.8.0.zip
cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/

# install
CUDA_EXT=1 pip install .
BUILD_EXT=1 pip install .
```

<p align="right">(<a href="#top">back to top</a>)</p>
10 changes: 6 additions & 4 deletions applications/Chat/coati/dataset/sft_dataset.py
@@ -49,12 +49,13 @@ def _preprocess(
max_length: int,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""Preprocess the data by tokenizing."""
sequences = [s + t for s, t in zip(sources, targets)]
sequences = [s + t + tokenizer.eos_token for s, t in zip(sources, targets)]
sequences_token = tokenizer(
sequences, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
sequences, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt", add_special_tokens=False
)

sources_token = tokenizer(
sources, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
sources, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt", add_special_tokens=False
)

assert sequences_token["attention_mask"].dim() == 2, "seq2seq model should be preprocessed differently"
@@ -65,7 +66,8 @@ def _preprocess(
if tokenizer.padding_side == "right":
# |prompt|completion|eos|pad|
labels[i][:source_len] = IGNORE_INDEX
labels[i][-pad_len:] = IGNORE_INDEX
if pad_len>0:
labels[i][-pad_len:] = IGNORE_INDEX
elif tokenizer.padding_side == "left":
# |pad|prompt|completion|eos|
labels[i][: pad_len + source_len] = IGNORE_INDEX
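
The pad_len guard above is the subtle part of this hunk: for a sequence that already fills max_length, pad_len is 0 and labels[i][-0:] selects the entire row, masking every target, which is how an SFT loss can end up NaN. A minimal sketch (not from the PR) illustrating the behavior:

```python
import torch

IGNORE_INDEX = -100
max_length = 8

# Stand-in labels for one fully packed sequence (no padding needed).
labels = torch.arange(max_length)
pad_len = 0

# Without the guard: -0 == 0, so labels[-0:] is labels[0:] and every target is masked.
broken = labels.clone()
broken[-pad_len:] = IGNORE_INDEX
print(broken)  # tensor([-100, -100, -100, -100, -100, -100, -100, -100])

# With the guard from this diff, an unpadded sequence keeps all of its targets.
fixed = labels.clone()
if pad_len > 0:
    fixed[-pad_len:] = IGNORE_INDEX
print(fixed)   # tensor([0, 1, 2, 3, 4, 5, 6, 7])
```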
2 changes: 1 addition & 1 deletion applications/Chat/examples/train_sft.sh
@@ -25,4 +25,4 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \
--accumulation_steps 8 \
--lr 2e-5 \
--max_datasets_size 512 \
--max_epochs 1
--max_epochs 1
68 changes: 10 additions & 58 deletions applications/Colossal-LLaMA-2/colossal_llama2/dataset/loader.py
@@ -1,20 +1,16 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import numpy as np
import os
import random
from dataclasses import dataclass
from typing import Dict, List, Union, Sequence, Optional, Iterator, Callable
from typing import Dict, Iterator, List, Optional, Sequence, Union

import torch
from datasets import dataset_dict, load_from_disk
import torch.nn.functional as F
from datasets import Dataset as HFDataset
from torch.distributed import ProcessGroup
from torch.distributed.distributed_c10d import _get_default_group
from torch.utils.data import ConcatDataset, Dataset, DataLoader, DistributedSampler
from datasets import dataset_dict, load_from_disk
from torch.utils.data import ConcatDataset, Dataset, DistributedSampler
from transformers.tokenization_utils import PreTrainedTokenizer
import torch.nn.functional as F

DatasetType = Union[Dataset, ConcatDataset, dataset_dict.Dataset]
PathType = Union[str, os.PathLike]
@@ -62,6 +58,7 @@ class DataCollatorForSupervisedDataset(object):
tokenizer: PreTrainedTokenizer
max_length: int = 4096
ignore_index: int = -100
padding: str = "max_length"

def __call__(self, instances: Sequence[Dict[str, List[int]]]) -> Dict[str, torch.Tensor]:
"""
@@ -106,10 +103,11 @@ def __call__(self, instances: Sequence[Dict[str, List[int]]]) -> Dict[str, torch
batch_first=True,
padding_value=self.ignore_index,
) # (bsz, max_len)
# pad to max
to_pad = self.max_length - input_ids.size(1)
input_ids = F.pad(input_ids, (0, to_pad), value=self.tokenizer.pad_token_id)
labels = F.pad(labels, (0, to_pad), value=self.ignore_index)
if self.padding == "max_length":
# pad to max
to_pad = self.max_length - input_ids.size(1)
input_ids = F.pad(input_ids, (0, to_pad), value=self.tokenizer.pad_token_id)
labels = F.pad(labels, (0, to_pad), value=self.ignore_index)
elif self.tokenizer.padding_side == "left":
reversed_input_ids = [seq.flip(dims=(0,)) for seq in batch_input_ids]
reversed_input_ids = torch.nn.utils.rnn.pad_sequence(
@@ -171,49 +169,3 @@ def __len__(self) -> int:

def set_start_index(self, start_index: int) -> None:
self.start_index = start_index


def setup_distributed_dataloader(
dataset: DatasetType,
batch_size: int = 1,
shuffle: bool = False,
seed: int = 1024,
drop_last: bool = False,
pin_memory: bool = False,
num_workers: int = 0,
collate_fn: Callable[[Sequence[Dict[str, Union[str, List[int]]]]], Dict[str, torch.Tensor]] = None,
process_group: Optional[ProcessGroup] = None,
**kwargs,
) -> DataLoader:
"""
Setup dataloader for distributed training.
"""
_kwargs = kwargs.copy()
process_group = process_group or _get_default_group()
sampler = StatefulDistributedSampler(
dataset=dataset,
num_replicas=process_group.size(),
rank=process_group.rank(),
shuffle=shuffle,
seed=seed,
drop_last=drop_last,
)

# Deterministic dataloader
def seed_worker(worker_id: int) -> None:
worker_seed = seed
np.random.seed(worker_seed)
torch.manual_seed(worker_seed)
random.seed(worker_seed)

return DataLoader(
dataset=dataset,
batch_size=batch_size,
sampler=sampler,
num_workers=num_workers,
collate_fn=collate_fn,
pin_memory=pin_memory,
drop_last=drop_last,
worker_init_fn=seed_worker,
**_kwargs,
)
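
With setup_distributed_dataloader removed from this module, a dataloader is now assembled directly from StatefulDistributedSampler and DataCollatorForSupervisedDataset. A minimal sketch under stated assumptions: the import path and the build_loader helper below are illustrative, not part of the PR, and the worker seeding mirrors the deleted helper.

```python
import random

import numpy as np
import torch
from torch.distributed.distributed_c10d import _get_default_group
from torch.utils.data import DataLoader

# Assumed import path for this module (applications/Colossal-LLaMA-2/colossal_llama2/dataset/loader.py).
from colossal_llama2.dataset.loader import (
    DataCollatorForSupervisedDataset,
    StatefulDistributedSampler,
)


def build_loader(dataset, tokenizer, batch_size: int = 1, seed: int = 1024, shuffle: bool = True) -> DataLoader:
    """Illustrative replacement for the removed setup_distributed_dataloader helper."""
    pg = _get_default_group()
    sampler = StatefulDistributedSampler(
        dataset=dataset,
        num_replicas=pg.size(),
        rank=pg.rank(),
        shuffle=shuffle,
        seed=seed,
    )
    collator = DataCollatorForSupervisedDataset(
        tokenizer=tokenizer,
        max_length=4096,
        padding="max_length",  # new field in this PR; any other value skips the pad-to-max step
    )

    def seed_worker(worker_id: int) -> None:
        # Keep worker RNG deterministic, as the removed helper did.
        np.random.seed(seed)
        torch.manual_seed(seed)
        random.seed(seed)

    return DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=sampler,
        collate_fn=collator,
        worker_init_fn=seed_worker,
    )
```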