dev: added cuda 12.4 build support (#255)
guocuimi committed Jun 30, 2024
1 parent 3c514fc commit b3ff0a9
Showing 14 changed files with 115 additions and 15 deletions.
16 changes: 16 additions & 0 deletions .github/workflows/publish_devel_image.yml
@@ -27,6 +27,22 @@ jobs:
       - name: Create cache directory
         run: mkdir -p $CI_CACHE_DIR/.buildx-cache

+      - name: Build devel image for cuda 12.4
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker
+          file: ./docker/Dockerfile.devel
+          push: true
+          cache-from: type=local,src=$CI_CACHE_DIR/.buildx-cache
+          cache-to: type=local,dest=$CI_CACHE_DIR/.buildx-cache
+          build-args: |
+            UBUNTU_VERSION=22.04
+            CUDA_VERSION=12.4
+            GCC_VERSION=12
+          tags: |
+            vectorchai/scalellm_devel:cuda12.4-ubuntu22.04
+            vectorchai/scalellm_devel:cuda12.4
+
       - name: Build devel image for cuda 12.1
         uses: docker/build-push-action@v5
         with:
13 changes: 13 additions & 0 deletions .github/workflows/publish_manylinux_image.yml
@@ -27,6 +27,19 @@ jobs:
       - name: Create cache directory
         run: mkdir -p $CI_CACHE_DIR/.buildx-cache

+      - name: Build base for cuda 12.4
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker
+          file: ./docker/Dockerfile.manylinux
+          push: true
+          cache-from: type=local,src=$CI_CACHE_DIR/.buildx-cache
+          cache-to: type=local,dest=$CI_CACHE_DIR/.buildx-cache
+          build-args: |
+            CUDA_VERSION=12.4
+          tags: |
+            vectorchai/scalellm_manylinux:cuda12.4
+
       - name: Build base for cuda 12.1
         uses: docker/build-push-action@v5
         with:
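Both new steps mirror the existing cuda 12.1 steps, so each CUDA version gets its own image build sharing the same buildx cache. The devel image is tagged both cuda12.4-ubuntu22.04 and cuda12.4, presumably so consumers can pin either the CUDA version alone or the CUDA/Ubuntu combination.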
9 changes: 8 additions & 1 deletion CMakeLists.txt
@@ -189,7 +189,14 @@ if (DEFINED ENV{LIBTORCH_ROOT})
   message(STATUS "Using libtorch at $ENV{LIBTORCH_ROOT}")
 else()
   include(FetchContent)
-  if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.1)
+  if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.4)
+    # download nightly libtorch with cuda 12.4 from pytorch.org
+    if (USE_CXX11_ABI)
+      set(LIBTORCH_URL "https://download.pytorch.org/libtorch/nightly/cu124/libtorch-cxx11-abi-shared-with-deps-latest.zip")
+    else()
+      set(LIBTORCH_URL "https://download.pytorch.org/libtorch/nightly/cu124/libtorch-shared-with-deps-latest.zip")
+    endif()
+  elseif(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.1)
     # download libtorch 2.3 with cuda 12.1 from pytorch.org
     if (USE_CXX11_ABI)
       set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu121/libtorch-cxx11-abi-shared-with-deps-2.3.0%2Bcu121.zip")
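One subtlety: since VERSION_GREATER_EQUAL 12.1 also matches a 12.4 toolkit, the new 12.4 check must come before the existing 12.1 branch, or CUDA 12.4 builds would keep downloading the cu121 libtorch. The 12.4 branch pulls a nightly libtorch build, presumably because no stable cu124 libtorch release was available on pytorch.org at the time of this commit.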
6 changes: 3 additions & 3 deletions docker/Dockerfile.base
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:${UBUNTU_VERSION}

 LABEL maintainer="[email protected]"
-ENV DEBIAN_FRONTEND noninteractive
+ENV DEBIAN_FRONTEND=noninteractive

 # Install common dependencies
 COPY ./common/install_base.sh install_base.sh
@@ -25,8 +25,8 @@ RUN rm install_python.sh
 ARG CUDA_VERSION=12.1
 COPY ./common/install_cuda.sh install_cuda.sh
 RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
-ENV DESIRED_CUDA ${CUDA_VERSION}
-ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
+ENV DESIRED_CUDA=${CUDA_VERSION}
+ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH

 # Install gcc
 ARG GCC_VERSION=11
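The ENV edits in this and the following Dockerfiles are behavior-preserving: Docker accepts both ENV key value and ENV key=value, but the space-separated form is legacy syntax that newer build tooling warns about, so only the instruction syntax changes, not the resulting image.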
6 changes: 3 additions & 3 deletions docker/Dockerfile.devel
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:${UBUNTU_VERSION}

 LABEL maintainer="[email protected]"
-ENV DEBIAN_FRONTEND noninteractive
+ENV DEBIAN_FRONTEND=noninteractive

 # Install common dependencies
 COPY ./common/install_base.sh install_base.sh
@@ -34,8 +34,8 @@ RUN rm install_ninja.sh
 ARG CUDA_VERSION=12.1
 COPY ./common/install_cuda.sh install_cuda.sh
 RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
-ENV DESIRED_CUDA ${CUDA_VERSION}
-ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
+ENV DESIRED_CUDA=${CUDA_VERSION}
+ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH

 # install rust
 ENV RUSTUP_HOME=/usr/local/rustup
8 changes: 4 additions & 4 deletions docker/Dockerfile.manylinux
@@ -2,11 +2,11 @@ ARG CUDA_VERSION=12.1
 FROM pytorch/manylinux-builder:cuda${CUDA_VERSION} as base

 LABEL maintainer="[email protected]"
-ENV DEBIAN_FRONTEND noninteractive
+ENV DEBIAN_FRONTEND=noninteractive

-ENV LC_ALL en_US.UTF-8
-ENV LANG en_US.UTF-8
-ENV LANGUAGE en_US.UTF-8
+ENV LC_ALL=en_US.UTF-8
+ENV LANG=en_US.UTF-8
+ENV LANGUAGE=en_US.UTF-8

 # Install dependencies for vcpkg
 RUN yum -y update && yum -y install \
8 changes: 4 additions & 4 deletions docker/Dockerfile.manylinux_2_28
@@ -1,11 +1,11 @@
 FROM quay.io/pypa/manylinux_2_28_x86_64 as base

 LABEL maintainer="[email protected]"
-ENV DEBIAN_FRONTEND noninteractive
+ENV DEBIAN_FRONTEND=noninteractive

-ENV LC_ALL en_US.UTF-8
-ENV LANG en_US.UTF-8
-ENV LANGUAGE en_US.UTF-8
+ENV LC_ALL=en_US.UTF-8
+ENV LANG=en_US.UTF-8
+ENV LANGUAGE=en_US.UTF-8

 # Install common dependencies
 COPY ./common/install_base.sh install_base.sh
4 changes: 4 additions & 0 deletions src/layers/activation_test.cpp
@@ -36,6 +36,10 @@ TEST_P(ActivationTest, Basic) {
   const auto& [device, dtype, activation, in_features, out_features] =
       GetParam();

+  if (device.is_cuda() && !torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
+
   auto input = torch::rand({in_features, out_features},
                            torch::dtype(dtype).device(device));

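The guard added above recurs in every GPU-dependent test in this commit, so the suites skip cleanly on CPU-only runners instead of aborting inside the first CUDA allocation. As a self-contained sketch of the pattern (the test and suite names below are invented for illustration; gtest and libtorch headers are assumed to be available):

#include <gtest/gtest.h>
#include <torch/torch.h>

// Invented example of the guard used throughout this commit: when the target
// device is CUDA but no CUDA runtime is present, mark the test as skipped
// before any GPU tensor is allocated.
TEST(CudaGuardSketch, SkipsGracefullyWithoutGpu) {
  const torch::Device device(torch::kCUDA);
  if (device.is_cuda() && !torch::cuda::is_available()) {
    GTEST_SKIP() << "CUDA not available, skipping test";
  }
  // Only reached on machines with a working CUDA runtime.
  auto x = torch::rand({4, 4}, torch::dtype(torch::kHalf).device(device));
  EXPECT_EQ(x.numel(), 16);
}

GTEST_SKIP() reports the test as skipped rather than passed or failed, so CPU-only runs stay green while the test report still shows which GPU coverage was not exercised.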
7 changes: 7 additions & 0 deletions src/layers/attention/attention_test.cpp
@@ -92,6 +92,9 @@ TEST_P(AttentionPrefillTest, Varlen) {
               head_dim,
               scale,
               alibi] = GetParam();
+  if (device.is_cuda() && !torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }

   absl::BitGen gen;

@@ -180,6 +183,10 @@ TEST_P(AttentionDecodeTest, KVCache) {
               head_dim,
               scale,
               alibi] = GetParam();
+  if (device.is_cuda() && !torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
+
   // make sure kv_max_seq_len >= q_max_seq_len
   if (kv_max_seq_len < q_max_seq_len) {
     GTEST_SKIP() << "kv_max_seq_len < q_max_seq_len";
12 changes: 12 additions & 0 deletions src/layers/normalization_test.cpp
@@ -37,6 +37,10 @@ TEST(NormalizationTest, LayerNorm) {
 }

 TEST(NormalizationTest, LayerNormKernel) {
+  if (!torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
+
   // TODO: test other device and dtype combinations
   const auto dtype = torch::kHalf;
   const auto device = torch::kCUDA;
@@ -98,6 +102,10 @@ TEST(NormalizationTest, RMSNorm) {
 }

 TEST(NormalizationTest, RMSNormKernel) {
+  if (!torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
+
   // TODO: test other device and dtype combinations
   const auto dtype = torch::kHalf;
   const auto device = torch::kCUDA;
@@ -125,6 +133,10 @@ TEST(NormalizationTest, RMSNormKernel) {
 }

 TEST(NormalizationTest, RMSNormResidualKernel) {
+  if (!torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
+
   const auto dtype = torch::kHalf;
   const auto device = torch::kCUDA;
   const auto options = torch::dtype(dtype).device(device);
8 changes: 8 additions & 0 deletions src/layers/pos_embedding_test.cpp
@@ -116,6 +116,10 @@ TEST_P(PosEmbeddingTest, Rotary) {
               theta,
               interleaved,
               max_position_embeddings] = GetParam();
+  if (device.is_cuda() && !torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
+
   const auto options = torch::dtype(dtype).device(device);

   // prepare inputs
@@ -194,6 +198,10 @@ TEST_P(PosEmbeddingKernelTest, Rotary) {
               interleaved,
               max_position_embeddings] = GetParam();

+  if (device.is_cuda() && !torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
+
   const auto options = torch::dtype(dtype).device(device);
   // prepare inputs
   torch::Tensor query = torch::rand({num_tokens, n_heads, head_dim}, options);
8 changes: 8 additions & 0 deletions src/memory/kv_cache_test.cpp
@@ -14,6 +14,10 @@ TEST(KVCacheTest, Empty) {
 }

 TEST(KVCacheTest, Basic) {
+  if (!torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
+
   const int num_kv_heads = 32;
   const int head_dim = 128;
   const int block_size = 8;
@@ -57,6 +61,10 @@ TEST(KVCacheTest, Basic) {
 }

 TEST(KVCacheTest, Random) {
+  if (!torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
+
   const int64_t num_kv_heads = 12;
   const int64_t head_dim = 128;
   const int64_t block_size = 4;
8 changes: 8 additions & 0 deletions src/quantization/qlinear_impl_test.cpp
@@ -23,6 +23,10 @@ TEST(QlinearTest, Basic) {
 }

 TEST(QlinearTest, ColumnParallelQuantLinear) {
+  if (!torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
+
   const int64_t in_features = 4096;
   const int64_t out_features = 4096;
   QuantArgs quant_args;
@@ -57,6 +61,10 @@ TEST(QlinearTest, ColumnParallelQuantLinear) {
 }

 TEST(QlinearTest, RowParallelQuantLinear) {
+  if (!torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
+
   const int64_t in_features = 4096;
   const int64_t out_features = 4096;
   QuantArgs quant_args;
17 changes: 17 additions & 0 deletions src/sampling/logits_processor_test.cpp
@@ -52,6 +52,9 @@ TEST(LogitsProcessorTest, Temperature) {
 }

 TEST(LogitsProcessorTest, TemperatureKernel) {
+  if (!torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
   // Test TemperatureLogitsProcessor
   torch::ScalarType dtype(torch::kHalf);
   torch::Device device(torch::kCUDA);
@@ -121,6 +124,9 @@ TEST(LogitsProcessorTest, FrequencyPresencePenalty) {
 }

 TEST(LogitsProcessorTest, FrequencyPresencePenaltyKernel) {
+  if (!torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
   // Test FrequencyPresencePenaltyLogitsProcessor
   torch::ScalarType dtype(torch::kHalf);
   torch::Device device(torch::kCUDA);
@@ -209,6 +215,9 @@ TEST(LogitsProcessorTest, RepetitionPenalty) {
 }

 TEST(LogitsProcessorTest, RepetitionPenaltyKernel) {
+  if (!torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
   // Test RepetitionPenaltyLogitsProcessor
   torch::ScalarType dtype(torch::kHalf);
   torch::Device device(torch::kCUDA);
@@ -244,6 +253,10 @@ TEST(LogitsProcessorTest, RepetitionPenaltyKernel) {
 }

 TEST(LogitsProcessorTest, TopK) {
+  if (!torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
+
   // Set the random seed
   torch::manual_seed(100);
   torch::ScalarType dtype(torch::kHalf);
@@ -284,6 +297,10 @@ TEST(LogitsProcessorTest, TopK) {
 }

 TEST(LogitsProcessorTest, TopP) {
+  if (!torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
+
   // Set the random seed
   torch::manual_seed(100);
   torch::ScalarType dtype(torch::kHalf);
