diff --git a/src/engine/llm_engine.cpp b/src/engine/llm_engine.cpp index 1e47ab99..c8f9d75c 100644 --- a/src/engine/llm_engine.cpp +++ b/src/engine/llm_engine.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -254,11 +255,32 @@ int64_t LLMEngine::profile_memory_for_kv_cache() { const auto& device = workers_[0]->device(); if (device.is_cpu()) { - // use max memory cache size for CPU - LOG(INFO) << "Initializing CPU cache with max cache size: " - << readable_size(max_cache_size); - // TODO: add CPU memory profiling - return max_cache_size; + // get cpu available memory and total memory + struct sysinfo info; + int err = sysinfo(&info); + if (err != 0) { + LOG(ERROR) << "Initializing CPU cache failure."; + } + int64_t available_memory = info.freeram; + int64_t total_memory = info.totalram; + + int64_t smallest_available_memory = std::numeric_limits::max(); + // apply memory cap from config if it is set + if (max_memory_utilization < 1.0) { + const int64_t buffer_memory = + total_memory * (1.0 - max_memory_utilization); + available_memory -= buffer_memory; + } + if (max_cache_size > 0) { + available_memory = std::min(available_memory, max_cache_size); + } + smallest_available_memory = + std::min(smallest_available_memory, available_memory); + + auto cache_size = std::max(smallest_available_memory, int64_t(0)); + LOG(INFO) << "Initializing CPU cache with cache size: " + << readable_size(cache_size); + return cache_size; } CHECK(device.is_cuda()) << "Only support CPU and CUDA device for now.";