diff --git a/.github/workflows/publish_devel_image.yml b/.github/workflows/publish_devel_image.yml
index 7cf50d44..2ad72132 100644
--- a/.github/workflows/publish_devel_image.yml
+++ b/.github/workflows/publish_devel_image.yml
@@ -27,6 +27,22 @@ jobs:
       - name: Create cache directory
         run: mkdir -p $CI_CACHE_DIR/.buildx-cache
 
+      - name: Build devel image for cuda 12.4
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker
+          file: ./docker/Dockerfile.devel
+          push: true
+          cache-from: type=local,src=$CI_CACHE_DIR/.buildx-cache
+          cache-to: type=local,dest=$CI_CACHE_DIR/.buildx-cache
+          build-args: |
+            UBUNTU_VERSION=22.04
+            CUDA_VERSION=12.4
+            GCC_VERSION=12
+          tags: |
+            vectorchai/scalellm_devel:cuda12.4-ubuntu22.04
+            vectorchai/scalellm_devel:cuda12.4
+
       - name: Build devel image for cuda 12.1
         uses: docker/build-push-action@v5
         with:
diff --git a/.github/workflows/publish_manylinux_image.yml b/.github/workflows/publish_manylinux_image.yml
index a4771175..10acbd5f 100644
--- a/.github/workflows/publish_manylinux_image.yml
+++ b/.github/workflows/publish_manylinux_image.yml
@@ -27,6 +27,19 @@ jobs:
       - name: Create cache directory
         run: mkdir -p $CI_CACHE_DIR/.buildx-cache
 
+      - name: Build base for cuda 12.4
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker
+          file: ./docker/Dockerfile.manylinux
+          push: true
+          cache-from: type=local,src=$CI_CACHE_DIR/.buildx-cache
+          cache-to: type=local,dest=$CI_CACHE_DIR/.buildx-cache
+          build-args: |
+            CUDA_VERSION=12.4
+          tags: |
+            vectorchai/scalellm_manylinux:cuda12.4
+
       - name: Build base for cuda 12.1
         uses: docker/build-push-action@v5
         with:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8b8741b7..e05e86f2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -189,7 +189,14 @@ if (DEFINED ENV{LIBTORCH_ROOT})
   message(STATUS "Using libtorch at $ENV{LIBTORCH_ROOT}")
 else()
   include(FetchContent)
-  if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.1)
+  if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.4)
+    # download nightly libtorch with cuda 12.4 from pytorch.org
+    if (USE_CXX11_ABI)
+      set(LIBTORCH_URL "https://download.pytorch.org/libtorch/nightly/cu124/libtorch-cxx11-abi-shared-with-deps-latest.zip")
+    else()
+      set(LIBTORCH_URL "https://download.pytorch.org/libtorch/nightly/cu124/libtorch-shared-with-deps-latest.zip")
+    endif()
+  elseif(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.1)
     # download libtorch 2.3 with cuda 12.1 from pytorch.org
     if (USE_CXX11_ABI)
       set(LIBTORCH_URL "https://download.pytorch.org/libtorch/cu121/libtorch-cxx11-abi-shared-with-deps-2.3.0%2Bcu121.zip")
diff --git a/docker/Dockerfile.base b/docker/Dockerfile.base
index cf53d463..cc130f19 100644
--- a/docker/Dockerfile.base
+++ b/docker/Dockerfile.base
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:${UBUNTU_VERSION}
 LABEL maintainer="mi@vectorch.com"
 
-ENV DEBIAN_FRONTEND noninteractive
+ENV DEBIAN_FRONTEND=noninteractive
 
 # Install common dependencies
 COPY ./common/install_base.sh install_base.sh
@@ -25,8 +25,8 @@ RUN rm install_python.sh
 ARG CUDA_VERSION=12.1
 COPY ./common/install_cuda.sh install_cuda.sh
 RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
-ENV DESIRED_CUDA ${CUDA_VERSION}
-ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
+ENV DESIRED_CUDA=${CUDA_VERSION}
+ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
 
 # Install gcc
 ARG GCC_VERSION=11
diff --git a/docker/Dockerfile.devel b/docker/Dockerfile.devel
index 8465ba38..2f8de99b 100644
--- a/docker/Dockerfile.devel
+++ b/docker/Dockerfile.devel
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:${UBUNTU_VERSION}
 LABEL maintainer="mi@vectorch.com"
 
-ENV DEBIAN_FRONTEND noninteractive
+ENV DEBIAN_FRONTEND=noninteractive
 
 # Install common dependencies
 COPY ./common/install_base.sh install_base.sh
@@ -34,8 +34,8 @@ RUN rm install_ninja.sh
 ARG CUDA_VERSION=12.1
 COPY ./common/install_cuda.sh install_cuda.sh
 RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
-ENV DESIRED_CUDA ${CUDA_VERSION}
-ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
+ENV DESIRED_CUDA=${CUDA_VERSION}
+ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
 
 # install rust
 ENV RUSTUP_HOME=/usr/local/rustup
diff --git a/docker/Dockerfile.manylinux b/docker/Dockerfile.manylinux
index d4d90cf4..8b3e9041 100644
--- a/docker/Dockerfile.manylinux
+++ b/docker/Dockerfile.manylinux
@@ -2,11 +2,11 @@ ARG CUDA_VERSION=12.1
 FROM pytorch/manylinux-builder:cuda${CUDA_VERSION} as base
 LABEL maintainer="mi@vectorch.com"
 
-ENV DEBIAN_FRONTEND noninteractive
+ENV DEBIAN_FRONTEND=noninteractive
 
-ENV LC_ALL en_US.UTF-8
-ENV LANG en_US.UTF-8
-ENV LANGUAGE en_US.UTF-8
+ENV LC_ALL=en_US.UTF-8
+ENV LANG=en_US.UTF-8
+ENV LANGUAGE=en_US.UTF-8
 
 # Install dependencies for vcpkg
 RUN yum -y update && yum -y install \
diff --git a/docker/Dockerfile.manylinux_2_28 b/docker/Dockerfile.manylinux_2_28
index 6464816d..e2ce3358 100644
--- a/docker/Dockerfile.manylinux_2_28
+++ b/docker/Dockerfile.manylinux_2_28
@@ -1,11 +1,11 @@
 FROM quay.io/pypa/manylinux_2_28_x86_64 as base
 LABEL maintainer="mi@vectorch.com"
 
-ENV DEBIAN_FRONTEND noninteractive
+ENV DEBIAN_FRONTEND=noninteractive
 
-ENV LC_ALL en_US.UTF-8
-ENV LANG en_US.UTF-8
-ENV LANGUAGE en_US.UTF-8
+ENV LC_ALL=en_US.UTF-8
+ENV LANG=en_US.UTF-8
+ENV LANGUAGE=en_US.UTF-8
 
 # Install common dependencies
 COPY ./common/install_base.sh install_base.sh
diff --git a/src/layers/activation_test.cpp b/src/layers/activation_test.cpp
index 4d2f803c..d62b2608 100644
--- a/src/layers/activation_test.cpp
+++ b/src/layers/activation_test.cpp
@@ -36,6 +36,10 @@ TEST_P(ActivationTest, Basic) {
   const auto& [device, dtype, activation, in_features, out_features] =
       GetParam();
 
+  if (device.is_cuda() && !torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
+
   auto input = torch::rand({in_features, out_features},
                            torch::dtype(dtype).device(device));
 
diff --git a/src/layers/attention/attention_test.cpp b/src/layers/attention/attention_test.cpp
index f10b5774..1fc90d68 100644
--- a/src/layers/attention/attention_test.cpp
+++ b/src/layers/attention/attention_test.cpp
@@ -92,6 +92,9 @@ TEST_P(AttentionPrefillTest, Varlen) {
               head_dim,
               scale,
               alibi] = GetParam();
+  if (device.is_cuda() && !torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
 
   absl::BitGen gen;
 
@@ -180,6 +183,10 @@ TEST_P(AttentionDecodeTest, KVCache) {
               head_dim,
               scale,
               alibi] = GetParam();
+  if (device.is_cuda() && !torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
+
   // make sure kv_max_seq_len >= q_max_seq_len
   if (kv_max_seq_len < q_max_seq_len) {
     GTEST_SKIP() << "kv_max_seq_len < q_max_seq_len";
diff --git a/src/layers/normalization_test.cpp b/src/layers/normalization_test.cpp
index 7d2d8645..5b239c92 100644
--- a/src/layers/normalization_test.cpp
+++ b/src/layers/normalization_test.cpp
@@ -37,6 +37,10 @@ TEST(NormalizationTest, LayerNorm) {
 }
 
 TEST(NormalizationTest, LayerNormKernel) {
+  if (!torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
+
   // TODO: test other device and dtype combinations
   const auto dtype = torch::kHalf;
   const auto device = torch::kCUDA;
@@ -98,6 +102,10 @@ TEST(NormalizationTest, RMSNorm) {
 }
 
 TEST(NormalizationTest, RMSNormKernel) {
+  if (!torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
+
   // TODO: test other device and dtype combinations
   const auto dtype = torch::kHalf;
   const auto device = torch::kCUDA;
@@ -125,6 +133,10 @@ TEST(NormalizationTest, RMSNormKernel) {
 }
 
 TEST(NormalizationTest, RMSNormResidualKernel) {
+  if (!torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
+
   const auto dtype = torch::kHalf;
   const auto device = torch::kCUDA;
   const auto options = torch::dtype(dtype).device(device);
diff --git a/src/layers/pos_embedding_test.cpp b/src/layers/pos_embedding_test.cpp
index 715dafcf..dcff3fb5 100644
--- a/src/layers/pos_embedding_test.cpp
+++ b/src/layers/pos_embedding_test.cpp
@@ -116,6 +116,10 @@ TEST_P(PosEmbeddingTest, Rotary) {
               theta,
               interleaved,
               max_position_embeddings] = GetParam();
+  if (device.is_cuda() && !torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
+
   const auto options = torch::dtype(dtype).device(device);
 
   // prepare inputs
@@ -194,6 +198,10 @@ TEST_P(PosEmbeddingKernelTest, Rotary) {
               interleaved,
               max_position_embeddings] = GetParam();
 
+  if (device.is_cuda() && !torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
+
   const auto options = torch::dtype(dtype).device(device);
   // prepare inputs
   torch::Tensor query = torch::rand({num_tokens, n_heads, head_dim}, options);
diff --git a/src/memory/kv_cache_test.cpp b/src/memory/kv_cache_test.cpp
index 609558f3..0edbe1eb 100644
--- a/src/memory/kv_cache_test.cpp
+++ b/src/memory/kv_cache_test.cpp
@@ -14,6 +14,10 @@ TEST(KVCacheTest, Empty) {
 }
 
 TEST(KVCacheTest, Basic) {
+  if (!torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
+
   const int num_kv_heads = 32;
   const int head_dim = 128;
   const int block_size = 8;
@@ -57,6 +61,10 @@ TEST(KVCacheTest, Basic) {
 }
 
 TEST(KVCacheTest, Random) {
+  if (!torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
+
   const int64_t num_kv_heads = 12;
   const int64_t head_dim = 128;
   const int64_t block_size = 4;
diff --git a/src/quantization/qlinear_impl_test.cpp b/src/quantization/qlinear_impl_test.cpp
index ed160bc5..89cd749d 100644
--- a/src/quantization/qlinear_impl_test.cpp
+++ b/src/quantization/qlinear_impl_test.cpp
@@ -23,6 +23,10 @@ TEST(QlinearTest, Basic) {
 }
 
 TEST(QlinearTest, ColumnParallelQuantLinear) {
+  if (!torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
+
   const int64_t in_features = 4096;
   const int64_t out_features = 4096;
   QuantArgs quant_args;
@@ -57,6 +61,10 @@ TEST(QlinearTest, ColumnParallelQuantLinear) {
 }
 
 TEST(QlinearTest, RowParallelQuantLinear) {
+  if (!torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
+
   const int64_t in_features = 4096;
   const int64_t out_features = 4096;
   QuantArgs quant_args;
diff --git a/src/sampling/logits_processor_test.cpp b/src/sampling/logits_processor_test.cpp
index f4d3ccc6..7c4fd803 100644
--- a/src/sampling/logits_processor_test.cpp
+++ b/src/sampling/logits_processor_test.cpp
@@ -52,6 +52,9 @@ TEST(LogitsProcessorTest, Temperature) {
 }
 
 TEST(LogitsProcessorTest, TemperatureKernel) {
+  if (!torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
   // Test TemperatureLogitsProcessor
   torch::ScalarType dtype(torch::kHalf);
   torch::Device device(torch::kCUDA);
@@ -121,6 +124,9 @@ TEST(LogitsProcessorTest, FrequencyPresencePenalty) {
 }
 
 TEST(LogitsProcessorTest, FrequencyPresencePenaltyKernel) {
+  if (!torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
   // Test FrequencyPresencePenaltyLogitsProcessor
   torch::ScalarType dtype(torch::kHalf);
   torch::Device device(torch::kCUDA);
@@ -209,6 +215,9 @@ TEST(LogitsProcessorTest, RepetitionPenalty) {
 }
 
 TEST(LogitsProcessorTest, RepetitionPenaltyKernel) {
+  if (!torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
   // Test RepetitionPenaltyLogitsProcessor
   torch::ScalarType dtype(torch::kHalf);
   torch::Device device(torch::kCUDA);
@@ -244,6 +253,10 @@ TEST(LogitsProcessorTest, RepetitionPenaltyKernel) {
 }
 
 TEST(LogitsProcessorTest, TopK) {
+  if (!torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
+
   // Set the random seed
   torch::manual_seed(100);
   torch::ScalarType dtype(torch::kHalf);
@@ -284,6 +297,10 @@ TEST(LogitsProcessorTest, TopK) {
 }
 
 TEST(LogitsProcessorTest, TopP) {
+  if (!torch::cuda::is_available()) {
+    GTEST_SKIP() << "CUDA not available, skipping test";
+  }
+
   // Set the random seed
   torch::manual_seed(100);
   torch::ScalarType dtype(torch::kHalf);