Sync/main #5424

Merged
39 commits merged on Mar 4, 2024

Commits (39)
abd8e77
[extension] fixed exception catch (#5342)
FrankLeeeee Jan 31, 2024
c523984
[Chat] fix sft loss nan (#5345)
YeAnbang Feb 1, 2024
ffffc32
[checkpointio] fix gemini and hybrid parallel optim checkpoint (#5347)
ver217 Feb 1, 2024
1c790c0
[fix] remove unnecessary dp_size assert (#5351)
CWHer Feb 2, 2024
2dd01e3
[gemini] fix param op hook when output is tuple (#5355)
ver217 Feb 4, 2024
6c0fa7b
[llama] fix dataloader for hybrid parallel (#5358)
ver217 Feb 5, 2024
73f9f23
[llama] update training script (#5360)
ver217 Feb 5, 2024
a4cec17
[llama] add flash attn patch for npu (#5362)
ver217 Feb 5, 2024
44ca61a
[llama] fix neftune & pbar with start_step (#5364)
Camille7777 Feb 5, 2024
a5756a8
[eval] update llama npu eval (#5366)
Camille7777 Feb 6, 2024
eb4f2d9
[llama] polish training script and fix optim ckpt (#5368)
ver217 Feb 6, 2024
c53ddda
[lr-scheduler] fix load state dict and add test (#5369)
ver217 Feb 6, 2024
084c912
[llama] fix memory issue (#5371)
ver217 Feb 6, 2024
7d8e033
[moe] init mixtral impl
oahzxl Dec 14, 2023
c904d2a
[moe] update capacity computing (#5253)
ver217 Jan 11, 2024
da39d21
[moe] support mixtral (#5309)
ver217 Jan 25, 2024
b60be18
[moe] fix mixtral checkpoint io (#5314)
ver217 Jan 27, 2024
956b561
[moe] fix mixtral forward default value (#5329)
ver217 Jan 30, 2024
65e5d6b
[moe] fix mixtral optim checkpoint (#5344)
ver217 Feb 1, 2024
06db94f
[moe] fix tests
ver217 Feb 8, 2024
4c03347
Merge pull request #5377 from hpcaitech/example/llama-npu
FrankLeeeee Feb 8, 2024
efef43b
Merge pull request #5372 from hpcaitech/exp/mixtral
FrankLeeeee Feb 8, 2024
adae123
[release] update version (#5380)
ver217 Feb 8, 2024
7303801
[llama] fix training and inference scripts (#5384)
ver217 Feb 19, 2024
69e3ad0
[doc] Fix typo (#5361)
yixiaoer Feb 19, 2024
705a62a
[doc] updated installation command (#5389)
FrankLeeeee Feb 19, 2024
b833153
[hotfix] fix variable type for top_p (#5313)
CZYCW Feb 19, 2024
5d380a1
[hotfix] Fix wrong import in meta_registry (#5392)
stephankoe Feb 20, 2024
95c21e3
[extension] hotfix jit extension setup (#5402)
ver217 Feb 26, 2024
d882d18
[example] reuse flash attn patch (#5400)
ver217 Feb 27, 2024
bf34c6f
[fsdp] impl save/load shard model/optimizer (#5357)
ericxsun Feb 27, 2024
dcdd8a5
[setup] fixed nightly release (#5388)
FrankLeeeee Feb 27, 2024
0a25e16
[shardformer]gather llama logits (#5398)
flybird11111 Feb 27, 2024
a28c971
update requirements (#5407)
TongLi3701 Feb 28, 2024
2461f37
[workflow] added pypi channel (#5412)
FrankLeeeee Feb 29, 2024
5de940d
[doc] fix blog link
binmakeswell Feb 29, 2024
a1c6cdb
[doc] fix blog link
binmakeswell Feb 29, 2024
4b8312c
fix sft single turn inference example (#5416)
Camille7777 Mar 1, 2024
0310b76
Merge branch 'main' into sync/main
FrankLeeeee Mar 4, 2024
12 changes: 8 additions & 4 deletions .github/workflows/release_nightly_on_schedule.yml
@@ -6,19 +6,23 @@ on:
- cron: '0 0 * * 6' # release on every Sunday 00:00 UTC time

jobs:
build-n-publish:
publish:
if: github.repository == 'hpcaitech/ColossalAI'
name: Build and publish Python 🐍 distributions 📦 to PyPI
runs-on: ubuntu-latest
timeout-minutes: 20
outputs:
status: ${{ steps.publish.outcome }}
steps:
- uses: actions/checkout@v2

- uses: actions/setup-python@v2
with:
python-version: '3.8.14'

- run: NIGHTLY=1 python setup.py sdist build
- run: |
python .github/workflows/scripts/update_setup_for_nightly.py
python setup.py sdist build

# publish to PyPI if executed on the main branch
- name: Publish package to PyPI
@@ -31,7 +35,7 @@ jobs:

notify:
name: Notify Lark via webhook
needs: build-n-publish
needs: publish
runs-on: ubuntu-latest
if: ${{ always() }} && github.repository == 'hpcaitech/ColossalAI'
steps:
@@ -62,4 +66,4 @@ jobs:
REPO: ${{ github.repository }}
RUN_ID: ${{ github.run_id }}
WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }}
STATUS: ${{ steps.publish.outcome }}
STATUS: ${{ needs.publish.outputs.status }}
2 changes: 1 addition & 1 deletion .github/workflows/release_test_pypi_before_merge.yml
@@ -49,6 +49,6 @@ jobs:
# we need to install the requirements.txt first
# as test-pypi may not contain the distributions for libs listed in the txt file
pip install -r requirements/requirements.txt
pip install --index-url https://test.pypi.org/simple/ colossalai==$VERSION
pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.python.org/pypi colossalai==$VERSION
env:
VERSION: ${{ steps.prep-version.outputs.version }}
34 changes: 34 additions & 0 deletions .github/workflows/scripts/update_setup_for_nightly.py
@@ -0,0 +1,34 @@
from datetime import datetime


def open_setup_file():
with open("setup.py", "r") as f:
file_lines = f.readlines()
return file_lines


def replace_nightly_package_info(file_lines):
version = datetime.today().strftime("%Y.%m.%d")
package_name = "colossalai-nightly"

for idx, line in enumerate(file_lines):
if "version = get_version()" in line:
file_lines[idx] = f'version = "{version}"\n'
if 'package_name = "colossalai"' in line:
file_lines[idx] = f'package_name = "{package_name}"\n'
return file_lines


def write_setup_file(file_lines):
with open("setup.py", "w") as f:
f.writelines(file_lines)


def main():
file_lines = open_setup_file()
file_lines = replace_nightly_package_info(file_lines)
write_setup_file(file_lines)


if __name__ == "__main__":
main()
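
For reference, a minimal sketch (not part of the PR) of the line rewriting this script performs on setup.py during a nightly build; the sample lines below only mirror the two patterns the script searches for:

```python
from datetime import datetime

# Stand-in for the two setup.py lines the script targets (hypothetical content).
setup_lines = [
    "version = get_version()\n",
    'package_name = "colossalai"\n',
]

version = datetime.today().strftime("%Y.%m.%d")
for idx, line in enumerate(setup_lines):
    if "version = get_version()" in line:
        setup_lines[idx] = f'version = "{version}"\n'  # pin the version to today's date
    if 'package_name = "colossalai"' in line:
        setup_lines[idx] = 'package_name = "colossalai-nightly"\n'  # publish under the nightly name

print("".join(setup_lines))
# version = "2024.03.04"          <- date of the nightly run
# package_name = "colossalai-nightly"
```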
10 changes: 5 additions & 5 deletions README.md
@@ -9,7 +9,7 @@
<a href="https://www.colossalai.org/"> Documentation </a> |
<a href="https://github.com/hpcaitech/ColossalAI/tree/main/examples"> Examples </a> |
<a href="https://github.com/hpcaitech/ColossalAI/discussions"> Forum </a> |
<a href="https://medium.com/@hpcaitech"> Blog </a></h3>
<a href="https://hpc-ai.com/blog"> Blog </a></h3>

[![GitHub Repo stars](https://img.shields.io/github/stars/hpcaitech/ColossalAI?style=social)](https://github.com/hpcaitech/ColossalAI/stargazers)
[![Build](https://github.com/hpcaitech/ColossalAI/actions/workflows/build_on_schedule.yml/badge.svg)](https://github.com/hpcaitech/ColossalAI/actions/workflows/build_on_schedule.yml)
@@ -398,10 +398,10 @@ pip install colossalai

**Note: only Linux is supported for now.**

However, if you want to build the PyTorch extensions during installation, you can set `CUDA_EXT=1`.
However, if you want to build the PyTorch extensions during installation, you can set `BUILD_EXT=1`.

```bash
CUDA_EXT=1 pip install colossalai
BUILD_EXT=1 pip install colossalai
```

**Otherwise, CUDA kernels will be built during runtime when you actually need them.**
@@ -429,7 +429,7 @@ By default, we do not compile CUDA/C++ kernels. ColossalAI will build them durin
If you want to install and enable CUDA kernel fusion (compulsory installation when using fused optimizer):

```shell
CUDA_EXT=1 pip install .
BUILD_EXT=1 pip install .
```

For Users with CUDA 10.2, you can still build ColossalAI from source. However, you need to manually download the cub library and copy it to the corresponding directory.
@@ -445,7 +445,7 @@ unzip 1.8.0.zip
cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/

# install
CUDA_EXT=1 pip install .
BUILD_EXT=1 pip install .
```

<p align="right">(<a href="#top">back to top</a>)</p>
10 changes: 6 additions & 4 deletions applications/Chat/coati/dataset/sft_dataset.py
@@ -49,12 +49,13 @@ def _preprocess(
max_length: int,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""Preprocess the data by tokenizing."""
sequences = [s + t for s, t in zip(sources, targets)]
sequences = [s + t + tokenizer.eos_token for s, t in zip(sources, targets)]
sequences_token = tokenizer(
sequences, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
sequences, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt", add_special_tokens=False
)

sources_token = tokenizer(
sources, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
sources, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt", add_special_tokens=False
)

assert sequences_token["attention_mask"].dim() == 2, "seq2seq model should be preprocessed differently"
@@ -65,7 +66,8 @@ def _preprocess(
if tokenizer.padding_side == "right":
# |prompt|completion|eos|pad|
labels[i][:source_len] = IGNORE_INDEX
labels[i][-pad_len:] = IGNORE_INDEX
if pad_len>0:
labels[i][-pad_len:] = IGNORE_INDEX
elif tokenizer.padding_side == "left":
# |pad|prompt|completion|eos|
labels[i][: pad_len + source_len] = IGNORE_INDEX
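
The pad_len guard above is the subtle part of this hunk: for a sequence that already fills max_length, pad_len is 0 and labels[i][-0:] selects the entire row, masking every target, which is how an SFT loss can end up NaN. A minimal sketch (not from the PR) illustrating the behavior:

```python
import torch

IGNORE_INDEX = -100
max_length = 8

# Stand-in labels for one fully packed sequence (no padding needed).
labels = torch.arange(max_length)
pad_len = 0

# Without the guard: -0 == 0, so labels[-0:] is labels[0:] and every target is masked.
broken = labels.clone()
broken[-pad_len:] = IGNORE_INDEX
print(broken)  # tensor([-100, -100, -100, -100, -100, -100, -100, -100])

# With the guard from this diff, an unpadded sequence keeps all of its targets.
fixed = labels.clone()
if pad_len > 0:
    fixed[-pad_len:] = IGNORE_INDEX
print(fixed)   # tensor([0, 1, 2, 3, 4, 5, 6, 7])
```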
2 changes: 1 addition & 1 deletion applications/Chat/examples/train_sft.sh
@@ -25,4 +25,4 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \
--accumulation_steps 8 \
--lr 2e-5 \
--max_datasets_size 512 \
--max_epochs 1
--max_epochs 1
68 changes: 10 additions & 58 deletions applications/Colossal-LLaMA-2/colossal_llama2/dataset/loader.py
@@ -1,20 +1,16 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import numpy as np
import os
import random
from dataclasses import dataclass
from typing import Dict, List, Union, Sequence, Optional, Iterator, Callable
from typing import Dict, Iterator, List, Optional, Sequence, Union

import torch
from datasets import dataset_dict, load_from_disk
import torch.nn.functional as F
from datasets import Dataset as HFDataset
from torch.distributed import ProcessGroup
from torch.distributed.distributed_c10d import _get_default_group
from torch.utils.data import ConcatDataset, Dataset, DataLoader, DistributedSampler
from datasets import dataset_dict, load_from_disk
from torch.utils.data import ConcatDataset, Dataset, DistributedSampler
from transformers.tokenization_utils import PreTrainedTokenizer
import torch.nn.functional as F

DatasetType = Union[Dataset, ConcatDataset, dataset_dict.Dataset]
PathType = Union[str, os.PathLike]
@@ -62,6 +58,7 @@ class DataCollatorForSupervisedDataset(object):
tokenizer: PreTrainedTokenizer
max_length: int = 4096
ignore_index: int = -100
padding: str = "max_length"

def __call__(self, instances: Sequence[Dict[str, List[int]]]) -> Dict[str, torch.Tensor]:
"""
@@ -106,10 +103,11 @@ def __call__(self, instances: Sequence[Dict[str, List[int]]]) -> Dict[str, torch
batch_first=True,
padding_value=self.ignore_index,
) # (bsz, max_len)
# pad to max
to_pad = self.max_length - input_ids.size(1)
input_ids = F.pad(input_ids, (0, to_pad), value=self.tokenizer.pad_token_id)
labels = F.pad(labels, (0, to_pad), value=self.ignore_index)
if self.padding == "max_length":
# pad to max
to_pad = self.max_length - input_ids.size(1)
input_ids = F.pad(input_ids, (0, to_pad), value=self.tokenizer.pad_token_id)
labels = F.pad(labels, (0, to_pad), value=self.ignore_index)
elif self.tokenizer.padding_side == "left":
reversed_input_ids = [seq.flip(dims=(0,)) for seq in batch_input_ids]
reversed_input_ids = torch.nn.utils.rnn.pad_sequence(
@@ -171,49 +169,3 @@ def __len__(self) -> int:

def set_start_index(self, start_index: int) -> None:
self.start_index = start_index


def setup_distributed_dataloader(
dataset: DatasetType,
batch_size: int = 1,
shuffle: bool = False,
seed: int = 1024,
drop_last: bool = False,
pin_memory: bool = False,
num_workers: int = 0,
collate_fn: Callable[[Sequence[Dict[str, Union[str, List[int]]]]], Dict[str, torch.Tensor]] = None,
process_group: Optional[ProcessGroup] = None,
**kwargs,
) -> DataLoader:
"""
Setup dataloader for distributed training.
"""
_kwargs = kwargs.copy()
process_group = process_group or _get_default_group()
sampler = StatefulDistributedSampler(
dataset=dataset,
num_replicas=process_group.size(),
rank=process_group.rank(),
shuffle=shuffle,
seed=seed,
drop_last=drop_last,
)

# Deterministic dataloader
def seed_worker(worker_id: int) -> None:
worker_seed = seed
np.random.seed(worker_seed)
torch.manual_seed(worker_seed)
random.seed(worker_seed)

return DataLoader(
dataset=dataset,
batch_size=batch_size,
sampler=sampler,
num_workers=num_workers,
collate_fn=collate_fn,
pin_memory=pin_memory,
drop_last=drop_last,
worker_init_fn=seed_worker,
**_kwargs,
)
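
With setup_distributed_dataloader removed from this module, a dataloader is now assembled directly from StatefulDistributedSampler and DataCollatorForSupervisedDataset. A minimal sketch under stated assumptions: the import path and the build_loader helper below are illustrative, not part of the PR, and the worker seeding mirrors the deleted helper.

```python
import random

import numpy as np
import torch
from torch.distributed.distributed_c10d import _get_default_group
from torch.utils.data import DataLoader

# Assumed import path for this module (applications/Colossal-LLaMA-2/colossal_llama2/dataset/loader.py).
from colossal_llama2.dataset.loader import (
    DataCollatorForSupervisedDataset,
    StatefulDistributedSampler,
)


def build_loader(dataset, tokenizer, batch_size: int = 1, seed: int = 1024, shuffle: bool = True) -> DataLoader:
    """Illustrative replacement for the removed setup_distributed_dataloader helper."""
    pg = _get_default_group()
    sampler = StatefulDistributedSampler(
        dataset=dataset,
        num_replicas=pg.size(),
        rank=pg.rank(),
        shuffle=shuffle,
        seed=seed,
    )
    collator = DataCollatorForSupervisedDataset(
        tokenizer=tokenizer,
        max_length=4096,
        padding="max_length",  # new field in this PR; any other value skips the pad-to-max step
    )

    def seed_worker(worker_id: int) -> None:
        # Keep worker RNG deterministic, as the removed helper did.
        np.random.seed(seed)
        torch.manual_seed(seed)
        random.seed(seed)

    return DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=sampler,
        collate_fn=collator,
        worker_init_fn=seed_worker,
    )
```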