Skip to content

Commit

Permalink
bugfix: fix multiple definition issue.
Browse files Browse the repository at this point in the history
This is related to commit d711c55.
  • Loading branch information
liutongxuan committed Jul 3, 2024
1 parent af84903 commit f1b99f6
Show file tree
Hide file tree
Showing 14 changed files with 172 additions and 172 deletions.
15 changes: 15 additions & 0 deletions scalellm/csrc/vlm_handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,21 @@ namespace py = pybind11;
using namespace pybind11::literals;

void init_vlm_handler(py::module_& m) {
py::enum_<Priority>(m, "Priority")
.value("DEFAULT", Priority::NORMAL)
.value("LOW", Priority::LOW)
.value("NORMAL", Priority::NORMAL)
.value("HIGH", Priority::HIGH)
.export_values();

py::class_<std::future<bool>>(m, "Future")
.def("wait",
&std::future<bool>::wait,
py::call_guard<py::gil_scoped_release>())
.def("get",
&std::future<bool>::get,
py::call_guard<py::gil_scoped_release>());

auto vlm_handler =
py::class_<VLMHandler>(m, "VLMHandler")
.def(py::init<const VLMHandler::Options&>(), py::arg("options"))
Expand Down
38 changes: 21 additions & 17 deletions src/common/metrics.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,14 +93,16 @@ class AutoCounter final {
// define gauge
// a gauge is a metric that represents a single numerical value that can
// arbitrarily go up and down.
#define DEFINE_GAUGE(name, desc) \
auto& GAUGE_##name = llm::Metrics::Instance().BuildGauge(#name, desc).Add({});
#define DEFINE_GAUGE(name, desc) \
prometheus::Gauge& GAUGE_##name = \
llm::Metrics::Instance().BuildGauge(#name, desc).Add({});

#define DEFINE_GAUGE_FAMILY(name, desc) \
auto& name##_family = llm::Metrics::Instance().BuildGauge(#name, desc);
#define DEFINE_GAUGE_FAMILY(name, desc) \
prometheus::Family<prometheus::Gauge>& name##_family = \
llm::Metrics::Instance().BuildGauge(#name, desc);

#define DEFINE_GAUGE_INSTANCE(alias, name, ...) \
auto& GAUGE_##alias = name##_family.Add(__VA_ARGS__);
prometheus::Gauge& GAUGE_##alias = name##_family.Add(__VA_ARGS__);

#define GAUGE_SET(name, value) GAUGE_##name.Set(value);

Expand All @@ -111,15 +113,16 @@ class AutoCounter final {
// define counter
// a counter is a monotonically increasing counter whose value can only increase
// or be reset to zero on restart.
#define DEFINE_COUNTER(name, desc) \
auto& COUNTER_##name = \
#define DEFINE_COUNTER(name, desc) \
prometheus::Counter& COUNTER_##name = \
llm::Metrics::Instance().BuildCounter(#name, desc).Add({});

#define DEFINE_COUNTER_FAMILY(name, desc) \
auto& name##_family = llm::Metrics::Instance().BuildCounter(#name, desc);
#define DEFINE_COUNTER_FAMILY(name, desc) \
prometheus::Family<prometheus::Counter>& name##_family = \
llm::Metrics::Instance().BuildCounter(#name, desc);

#define DEFINE_COUNTER_INSTANCE(alias, name, ...) \
auto& COUNTER_##alias = name##_family.Add(__VA_ARGS__);
prometheus::Counter& COUNTER_##alias = name##_family.Add(__VA_ARGS__);

#define COUNTER_ADD(name, value) COUNTER_##name.Increment(value);

Expand All @@ -133,16 +136,17 @@ class AutoCounter final {
// a histogram samples observations (usually things like request durations or
// response sizes) and counts them in configurable buckets. It also provides a
// sum of all observed values.
#define DEFINE_HISTOGRAM(name, desc, ...) \
auto& HISTOGRAM_##name = llm::Metrics::Instance() \
.BuildHistogram(#name, desc) \
.Add({}, __VA_ARGS__);
#define DEFINE_HISTOGRAM(name, desc, ...) \
prometheus::Histogram& HISTOGRAM_##name = llm::Metrics::Instance() \
.BuildHistogram(#name, desc) \
.Add({}, __VA_ARGS__);

#define DEFINE_HISTOGRAM_FAMILY(name, desc) \
auto& name##_family = llm::Metrics::Instance().BuildHistogram(#name, desc);
#define DEFINE_HISTOGRAM_FAMILY(name, desc) \
prometheus::Family<prometheus::Histogram>& name##_family = \
llm::Metrics::Instance().BuildHistogram(#name, desc);

#define DEFINE_HISTOGRAM_INSTANCE(alias, name, ...) \
auto& HISTOGRAM_##alias = name##_family.Add(__VA_ARGS__);
prometheus::Histogram& HISTOGRAM_##alias = name##_family.Add(__VA_ARGS__);

#define HISTOGRAM_OBSERVE(name, value) HISTOGRAM_##name.Observe(value);

Expand Down
2 changes: 2 additions & 0 deletions src/engine/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ cc_library(
engine.h
llm_engine.h
vlm_engine.h
engine_metrics.h
SRCS
utils.cpp
batch.cpp
Expand All @@ -22,6 +23,7 @@ cc_library(
vlm_worker.cpp
llm_engine.cpp
vlm_engine.cpp
engine_metrics.cpp
DEPS
torch
:common
Expand Down
15 changes: 15 additions & 0 deletions src/engine/engine_metrics.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#include "engine_metrics.h"

// Definitions of the engine latency counters. The names match the DECLARE_*
// entries in engine_metrics.h.

// Unlabelled counter; DEFINE_COUNTER names the resulting variable
// COUNTER_prepare_input_latency_seconds.
DEFINE_COUNTER(prepare_input_latency_seconds,
"Latency of preparing input in seconds");
// Counter family shared by the per-stage counters below. It must come first:
// each DEFINE_COUNTER_INSTANCE expands to a call to
// execution_latency_seconds_family.Add(...).
DEFINE_COUNTER_FAMILY(execution_latency_seconds,
"Execution latency in seconds");
// One counter per execution stage, told apart by the "stage" label.
DEFINE_COUNTER_INSTANCE(model_execution_latency_seconds,
execution_latency_seconds,
{{"stage", "model"}});
DEFINE_COUNTER_INSTANCE(logits_processing_latency_seconds,
execution_latency_seconds,
{{"stage", "logits_processing"}});
DEFINE_COUNTER_INSTANCE(sampling_latency_seconds,
execution_latency_seconds,
{{"stage", "sampling"}});
9 changes: 9 additions & 0 deletions src/engine/engine_metrics.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#pragma once

#include "common/metrics.h"

// Declarations of the engine latency counters; the matching DEFINE_*
// invocations are in engine_metrics.cpp. Translation units that call
// COUNTER_ADD on these names include this header.
// NOTE(review): the DECLARE_* macros are not shown here; presumably they
// expand to extern declarations in common/metrics.h -- confirm.
DECLARE_COUNTER(prepare_input_latency_seconds)
DECLARE_COUNTER_FAMILY(execution_latency_seconds)
DECLARE_COUNTER_INSTANCE(model_execution_latency_seconds)
DECLARE_COUNTER_INSTANCE(logits_processing_latency_seconds)
DECLARE_COUNTER_INSTANCE(sampling_latency_seconds)
4 changes: 1 addition & 3 deletions src/engine/llm_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,12 @@

#include "common/metrics.h"
#include "common/pretty_print.h"
#include "engine_metrics.h"
#include "model_loader/model_loader.h"
#include "model_parallel/parallel_args.h"
#include "models/model_args.h"
#include "worker.h"

DEFINE_COUNTER(prepare_input_latency_seconds,
"Latency of preparing input in seconds");

namespace llm {
namespace {
const std::vector<uint32_t> kDefaultBatchSizesForCudaGraph =
Expand Down
6 changes: 2 additions & 4 deletions src/engine/vlm_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,12 @@

#include "common/metrics.h"
#include "common/pretty_print.h"
#include "engine_metrics.h"
#include "model_loader/model_loader.h"
#include "model_parallel/parallel_args.h"
#include "models/model_args.h"
#include "vlm_worker.h"

// DEFINE_COUNTER(prepare_input_latency_seconds,
// "Latency of preparing input in seconds");

namespace llm {
namespace {
// clang-format off
Expand Down Expand Up @@ -270,7 +268,7 @@ ModelOutput VLMEngine::execute_model(Batch& batch) {
Timer timer;
auto model_inputs = batch.prepare_model_input(options_.num_decoding_tokens(),
adjusted_batch_size);
// COUNTER_ADD(prepare_input_latency_seconds, timer.elapsed_seconds());
COUNTER_ADD(prepare_input_latency_seconds, timer.elapsed_seconds());

if (!model_inputs.token_ids.defined()) {
// empty input, just return
Expand Down
20 changes: 4 additions & 16 deletions src/engine/vlm_worker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "common/metrics.h"
#include "common/threadpool.h"
#include "common/timer.h"
#include "engine_metrics.h"
#include "memory/kv_cache.h"
#include "memory/memory.h"
#include "model_loader/state_dict.h"
Expand All @@ -23,19 +24,6 @@
#include "sampling/logits_processor.h"
#include "sampling/sampler.h"

// latency metrics
// DEFINE_COUNTER_FAMILY(execution_latency_seconds,
// "Execution latency in seconds");
// DEFINE_COUNTER_INSTANCE(model_execution_latency_seconds,
// execution_latency_seconds,
// {{"stage", "model"}});
// DEFINE_COUNTER_INSTANCE(logits_processing_latency_seconds,
// execution_latency_seconds,
// {{"stage", "logits_processing"}});
// DEFINE_COUNTER_INSTANCE(sampling_latency_seconds,
// execution_latency_seconds,
// {{"stage", "sampling"}});

namespace llm {

VLMWorker::VLMWorker(const ParallelArgs& parallel_args,
Expand Down Expand Up @@ -149,7 +137,7 @@ std::optional<ModelOutput> VLMWorker::execute_model(const ModelInput& inputs) {
}

at::cuda::getCurrentCUDAStream().synchronize();
// COUNTER_ADD(model_execution_latency_seconds, timer.elapsed_seconds());
COUNTER_ADD(model_execution_latency_seconds, timer.elapsed_seconds());

if (!driver_) {
return std::nullopt;
Expand All @@ -166,7 +154,7 @@ std::optional<ModelOutput> VLMWorker::execute_model(const ModelInput& inputs) {
sampling_params.unique_token_ids,
sampling_params.unique_token_counts,
sampling_params.unique_token_ids_lens);
// COUNTER_ADD(logits_processing_latency_seconds, timer.elapsed_seconds());
COUNTER_ADD(logits_processing_latency_seconds, timer.elapsed_seconds());

// set logits to output
output.logits = logits;
Expand All @@ -179,7 +167,7 @@ std::optional<ModelOutput> VLMWorker::execute_model(const ModelInput& inputs) {
auto sample_logits =
logits.index_select(/*dim=*/0, sampling_params.sample_idxes);
auto sample_output = sampler->forward(sample_logits);
// COUNTER_ADD(sampling_latency_seconds, timer.elapsed_seconds());
COUNTER_ADD(sampling_latency_seconds, timer.elapsed_seconds());

// set sample output to output
output.sample_output = sample_output;
Expand Down
14 changes: 1 addition & 13 deletions src/engine/worker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "common/metrics.h"
#include "common/threadpool.h"
#include "common/timer.h"
#include "engine_metrics.h"
#include "memory/kv_cache.h"
#include "memory/memory.h"
#include "model_loader/state_dict.h"
Expand All @@ -23,19 +24,6 @@
#include "sampling/logits_processor.h"
#include "sampling/sampler.h"

// latency metrics
DEFINE_COUNTER_FAMILY(execution_latency_seconds,
"Execution latency in seconds");
DEFINE_COUNTER_INSTANCE(model_execution_latency_seconds,
execution_latency_seconds,
{{"stage", "model"}});
DEFINE_COUNTER_INSTANCE(logits_processing_latency_seconds,
execution_latency_seconds,
{{"stage", "logits_processing"}});
DEFINE_COUNTER_INSTANCE(sampling_latency_seconds,
execution_latency_seconds,
{{"stage", "sampling"}});

namespace llm {

Worker::Worker(const ParallelArgs& parallel_args,
Expand Down
2 changes: 2 additions & 0 deletions src/handlers/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@ cc_library(
sampling_params.h
llm_handler.h
vlm_handler.h
handler_metrics.h
SRCS
llm_handler.cpp
vlm_handler.cpp
handler_metrics.cpp
DEPS
:common
:scheduler
Expand Down
42 changes: 42 additions & 0 deletions src/handlers/handler_metrics.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#include "handler_metrics.h"

// Definitions of the request handler counters. The names match the DECLARE_*
// entries in handler_metrics.h.

// Request status counters: one counter per status code, all created from the
// request_status_total family and told apart by the "code" label. The family
// must come first because each DEFINE_COUNTER_INSTANCE expands to a call to
// request_status_total_family.Add(...).
DEFINE_COUNTER_FAMILY(request_status_total, "Total number of request status");
DEFINE_COUNTER_INSTANCE(request_ok, request_status_total, {{"code", "OK"}});
DEFINE_COUNTER_INSTANCE(request_cancelled,
request_status_total,
{{"code", "CANCELLED"}});
DEFINE_COUNTER_INSTANCE(request_unknown,
request_status_total,
{{"code", "UNKNOWN"}});
DEFINE_COUNTER_INSTANCE(request_invalid_argument,
request_status_total,
{{"code", "INVALID_ARGUMENT"}});
DEFINE_COUNTER_INSTANCE(request_deadline_exceeded,
request_status_total,
{{"code", "DEADLINE_EXCEEDED"}});
DEFINE_COUNTER_INSTANCE(request_resource_exhausted,
request_status_total,
{{"code", "RESOURCE_EXHAUSTED"}});
DEFINE_COUNTER_INSTANCE(request_unauthenticated,
request_status_total,
{{"code", "UNAUTHENTICATED"}});
DEFINE_COUNTER_INSTANCE(request_unavailable,
request_status_total,
{{"code", "UNAVAILABLE"}});
DEFINE_COUNTER_INSTANCE(request_unimplemented,
request_status_total,
{{"code", "UNIMPLEMENTED"}});

// Request handling latency, split by request type through the "type" label.
DEFINE_COUNTER_FAMILY(request_handling_latency_seconds,
"Request handling latency in seconds");
DEFINE_COUNTER_INSTANCE(chat_handling_latency_seconds,
request_handling_latency_seconds,
{{"type", "chat"}});
DEFINE_COUNTER_INSTANCE(completion_handling_latency_seconds,
request_handling_latency_seconds,
{{"type", "completion"}});

// Unlabelled latency counters (DEFINE_COUNTER adds an instance with an empty
// label set).
DEFINE_COUNTER(tokenization_latency_seconds,
"Prompt tokenization latency in seconds");
DEFINE_COUNTER(chat_template_latency_seconds,
"Chat template latency in seconds");
19 changes: 19 additions & 0 deletions src/handlers/handler_metrics.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#pragma once

#include "common/metrics.h"

// Declarations of the request handler counters; the matching DEFINE_*
// invocations are in handler_metrics.cpp.
// NOTE(review): the DECLARE_* macros are not shown here; presumably they
// expand to extern declarations in common/metrics.h -- confirm.

// Request status counters, one per status code.
DECLARE_COUNTER_FAMILY(request_status_total)
DECLARE_COUNTER_INSTANCE(request_ok)
DECLARE_COUNTER_INSTANCE(request_cancelled)
DECLARE_COUNTER_INSTANCE(request_unknown)
DECLARE_COUNTER_INSTANCE(request_invalid_argument)
DECLARE_COUNTER_INSTANCE(request_deadline_exceeded)
DECLARE_COUNTER_INSTANCE(request_resource_exhausted)
DECLARE_COUNTER_INSTANCE(request_unauthenticated)
DECLARE_COUNTER_INSTANCE(request_unavailable)
DECLARE_COUNTER_INSTANCE(request_unimplemented)
// Request handling latency, per request type.
DECLARE_COUNTER_FAMILY(request_handling_latency_seconds)
DECLARE_COUNTER_INSTANCE(chat_handling_latency_seconds)
DECLARE_COUNTER_INSTANCE(completion_handling_latency_seconds)
// Unlabelled latency counters.
DECLARE_COUNTER(tokenization_latency_seconds)
DECLARE_COUNTER(chat_template_latency_seconds)
42 changes: 1 addition & 41 deletions src/handlers/llm_handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,53 +13,13 @@
#include "common/scope_guard.h"
#include "common/timer.h"
#include "engine/utils.h"
#include "handler_metrics.h"
#include "models/model_args.h"
#include "models/model_registry.h"
#include "request/output.h"
#include "request/request.h"
#include "speculative/speculative_engine.h"

DEFINE_COUNTER_FAMILY(request_status_total, "Total number of request status");
DEFINE_COUNTER_INSTANCE(request_ok, request_status_total, {{"code", "OK"}});
DEFINE_COUNTER_INSTANCE(request_cancelled,
request_status_total,
{{"code", "CANCELLED"}});
DEFINE_COUNTER_INSTANCE(request_unknown,
request_status_total,
{{"code", "UNKNOWN"}});
DEFINE_COUNTER_INSTANCE(request_invalid_argument,
request_status_total,
{{"code", "INVALID_ARGUMENT"}});
DEFINE_COUNTER_INSTANCE(request_deadline_exceeded,
request_status_total,
{{"code", "DEADLINE_EXCEEDED"}});
DEFINE_COUNTER_INSTANCE(request_resource_exhausted,
request_status_total,
{{"code", "RESOURCE_EXHAUSTED"}});
DEFINE_COUNTER_INSTANCE(request_unauthenticated,
request_status_total,
{{"code", "UNAUTHENTICATED"}});
DEFINE_COUNTER_INSTANCE(request_unavailable,
request_status_total,
{{"code", "UNAVAILABLE"}});
DEFINE_COUNTER_INSTANCE(request_unimplemented,
request_status_total,
{{"code", "UNIMPLEMENTED"}});

DEFINE_COUNTER_FAMILY(request_handling_latency_seconds,
"Request handling latency in seconds");
DEFINE_COUNTER_INSTANCE(chat_handling_latency_seconds,
request_handling_latency_seconds,
{{"type", "chat"}});
DEFINE_COUNTER_INSTANCE(completion_handling_latency_seconds,
request_handling_latency_seconds,
{{"type", "completion"}});

DEFINE_COUNTER(tokenization_latency_seconds,
"Prompt tokenization latency in seconds");
DEFINE_COUNTER(chat_template_latency_seconds,
"Chat template latency in seconds");

namespace llm {
namespace {

Expand Down
Loading

0 comments on commit f1b99f6

Please sign in to comment.