-
Notifications
You must be signed in to change notification settings - Fork 4.3k
feat(server): enhance server concurrency with task queue, worker threads, and graceful shutdown #3181
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Conversation
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've left a few comments about some minor nits, and I'll go through the code more thoroughly, but I might not have time until after the weekend.
…asks function in server.cpp
…on_tasks function
… unnecessary std::thread initialization syntax in worker thread setup.
…ability by aligning the initialization syntax.
…adability by adding line breaks for clarity.
…r.cpp for improved readability.
@danbev thank you for the code review. I've addressed all of them. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We cannot merge this - the code is extremely heavy and difficult to understand.
It's better to take smaller steps where we can more easily review and discuss the approach. Here you are trying to achieve too many things at once and I am very doubtful that things work correctly. Even if they do, it's really hard to follow what is going on.
Thank you for the effort and sorry to reject this change.
// ---- PORTABLE COUNTING SEMAPHORE ---- | ||
/// Minimal counting semaphore built from a mutex and a condition variable.
///
/// `count_` is the number of permits currently available; every access to it
/// is serialized by `mtx_`. The class is neither copyable nor movable because
/// it owns a std::mutex and a std::condition_variable.
class CountingSemaphore {
public:
    /// @param initial Number of permits available right after construction.
    explicit CountingSemaphore(int initial) : count_(initial) {}

    /// Returns one permit and wakes at most one thread blocked in acquire().
    void release() {
        std::lock_guard<std::mutex> lock(mtx_);
        ++count_;
        // Notified while the lock is still held, exactly as before, so the
        // wake-up cannot race with a waiter that is about to block.
        cv_.notify_one();
    }

    /// Blocks until a permit is available, then takes it.
    ///
    /// Without this member the condition variable signalled by release() had
    /// no waiter at all; the predicate form of wait() also absorbs spurious
    /// wake-ups.
    void acquire() {
        std::unique_lock<std::mutex> lock(mtx_);
        cv_.wait(lock, [this] { return count_ > 0; });
        --count_;
    }

    /// Takes a permit if one is available, without ever blocking.
    /// @return true if a permit was taken, false if none was available.
    bool try_acquire() {
        std::lock_guard<std::mutex> lock(mtx_);
        if (count_ > 0) {
            --count_;
            return true;
        }
        return false;
    }

private:
    std::mutex mtx_;              ///< Guards count_.
    std::condition_variable cv_;  ///< Signalled by release(), awaited by acquire().
    int count_;                   ///< Permits currently available.
};
|
||
using namespace httplib; | ||
using json = nlohmann::ordered_json; | ||
|
||
namespace { | ||
// Add debug logging macro for convenience | ||
#define SERVER_DEBUG(msg) \ | ||
do { \ | ||
if (g_debug_mode) \ | ||
std::cerr << "[SERVER_DEBUG] " << msg << std::endl; \ | ||
} while (0) | ||
|
||
// ---- GLOBALS FOR SIGNAL HANDLING ---- | ||
std::atomic<bool> shutdown_requested{false}; | ||
httplib::Server *g_svr_ptr = nullptr; | ||
std::atomic<int> g_active_tasks{0}; | ||
std::unique_ptr<CountingSemaphore> task_queue_slots; | ||
std::atomic<int> active_http_requests{0}; | ||
// -------------------------------------- | ||
|
||
bool g_debug_mode = false; | ||
|
||
// RAII guard for HTTP request counting.
//
// Adds 1 to the referenced counter on construction and subtracts 1 on
// destruction, so the counter reflects the number of live guards.
struct HttpRequestCounter {
    // counter: the atomic to adjust; it must outlive this guard because only
    // a reference to it is stored.
    explicit HttpRequestCounter(std::atomic<int> &counter)
        : ctr(counter) {
        ctr.fetch_add(1, std::memory_order_relaxed);
    }

    ~HttpRequestCounter() {
        ctr.fetch_sub(1, std::memory_order_relaxed);
    }

    // A copy would run the destructor a second time without a matching
    // increment and drive the counter below its true value, so copying is
    // disabled. Declaring these also suppresses the implicit move operations.
    HttpRequestCounter(const HttpRequestCounter &) = delete;
    HttpRequestCounter &operator=(const HttpRequestCounter &) = delete;

    std::atomic<int>& ctr;
};
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are too many synchronization primitives here, which makes me think that there are likely issues with the synchronization logic. Most of these should not be needed. A simple task queue with a mutex should be enough for the server logic.
struct TranscriptionTask { | ||
int id; | ||
std::shared_ptr<std::vector<float>> audio_data_pcmf32_ptr; | ||
std::shared_ptr<std::vector<std::vector<float>>> audio_data_pcmf32s_ptr; | ||
whisper_params params; | ||
std::promise<json> result_promise; | ||
std::string original_filename; | ||
std::shared_ptr<std::atomic<bool>> connection_alive; | ||
std::shared_ptr<std::atomic<bool>> cancel_flag; | ||
}; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`std::shared_ptr` should not be needed here. `std::shared_ptr<std::atomic<bool>>` seems really suspicious.
std::shared_ptr<std::vector<float>> audio_data_pcmf32_ptr; | ||
std::shared_ptr<std::vector<std::vector<float>>> audio_data_pcmf32s_ptr; | ||
whisper_params params; | ||
std::promise<json> result_promise; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Avoid `std::future`/`std::promise` - this is very complex logic that should be avoided.
std::queue<TranscriptionTask> task_queue; | ||
std::mutex task_queue_mutex; | ||
std::condition_variable task_queue_cv; | ||
std::atomic<int> next_task_id{0}; | ||
|
||
// RAII for whisper_context | ||
using ContextPtr = std::unique_ptr<whisper_context, decltype(&whisper_free)>; | ||
|
||
std::vector<ContextPtr> context_pool; | ||
std::vector<std::thread> worker_threads; | ||
std::mutex contextPoolMutex; | ||
std::mutex contextManagementMutex; | ||
|
||
// RAII thread guard for monitor thread | ||
class ThreadGuard { // Renamed | ||
std::thread t; | ||
public: | ||
explicit ThreadGuard(std::thread&& thr) : t(std::move(thr)) {} | ||
~ThreadGuard() { if (t.joinable()) t.join(); } | ||
ThreadGuard(const ThreadGuard&) = delete; | ||
ThreadGuard& operator=(const ThreadGuard&) = delete; | ||
ThreadGuard(ThreadGuard&& other) noexcept | ||
: t(std::move(other.t)) {} | ||
ThreadGuard& operator=(ThreadGuard&& other) noexcept { | ||
if (t.joinable()) t.join(); | ||
t = std::move(other.t); | ||
return *this; | ||
} | ||
if (req.has_file("split_on_word")) | ||
{ | ||
params.split_on_word = parse_str_to_bool(req.get_file_value("split_on_word").content); | ||
}; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Even more synchronization primitives.
Avoid global state.
// Forward declaration for parse_request_data, enqueue_task, and prepare_and_send_response | ||
bool parse_request_data(const Request &req, const server_params &sparams, whisper_params &req_params, bool &is_json_req, std::string &filename, std::shared_ptr<std::vector<float>> &pcmf32_ptr, std::shared_ptr<std::vector<std::vector<float>>> &pcmf32s_ptr, Response &res); | ||
bool enqueue_task(std::atomic<int> &task_id_counter, std::shared_ptr<std::vector<float>> pcmf32_ptr, std::shared_ptr<std::vector<std::vector<float>>> pcmf32s_ptr, const std::string& original_filename, const whisper_params& params, std::shared_ptr<std::atomic<bool>> connection_alive, std::shared_ptr<std::atomic<bool>> cancel_flag, const server_params& sparams, std::shared_future<json>& result_fut); | ||
void prepare_and_send_response(Response &res, const Request &req, std::shared_future<json> fut, std::shared_ptr<std::atomic<bool>> connection_alive, std::shared_ptr<std::atomic<bool>> cancel_flag, int task_id, const server_params& sparams); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is unreadable.
{ | ||
std::lock_guard<std::mutex> lock(task_queue_mutex); // Lock acquired here | ||
// All checks are now performed under this single lock | ||
if (task_queue.empty() && | ||
g_active_tasks.load(std::memory_order_relaxed) == 0 && |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Atomic loads inside a locked region - very dubious.
SERVER_DEBUG("Server listen_after_bind() returned false. Initiating shutdown procedure."); | ||
shutdown_requested.store(true, std::memory_order_release); // Ensure flag is set | ||
} else { // server_ran_ok is true, meaning listen_after_bind returned normally without a prior signal. | ||
SERVER_DEBUG("Server listen_after_bind() completed (returned true). Initiating shutdown procedure."); | ||
shutdown_requested.store(true, std::memory_order_release); // Ensure flag is set | ||
} | ||
|
||
// Notify workers that shutdown has begun, in case they are waiting on the CV | ||
task_queue_cv.notify_all(); | ||
SERVER_DEBUG("Waiting for worker threads to process remaining queue and finish active tasks..."); | ||
|
||
// --- NEW DYNAMIC DRAIN LOGIC --- | ||
bool all_tasks_cleared_gracefully = false; | ||
SERVER_DEBUG("Graceful drain loop started. Waiting for up to " << graceful_shutdown_wait_seconds.count() << "s for tasks to clear."); | ||
|
||
// Loop until deadline or all tasks are done | ||
while (std::chrono::steady_clock::now() < graceful_shutdown_deadline) { | ||
bool all_clear_now = false; | ||
{ | ||
std::lock_guard<std::mutex> lock(task_queue_mutex); // Lock acquired here | ||
// All checks are now performed under this single lock | ||
if (task_queue.empty() && | ||
g_active_tasks.load(std::memory_order_relaxed) == 0 && | ||
active_http_requests.load(std::memory_order_relaxed) == 0) { | ||
SERVER_DEBUG("All tasks, queue, and HTTP requests cleared (under lock) before graceful shutdown deadline."); | ||
all_tasks_cleared_gracefully = true; | ||
all_clear_now = true; // Signal to break loop | ||
} | ||
} // Lock released here | ||
|
||
if (all_clear_now) { | ||
break; // Exit loop, all work done | ||
} | ||
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // Poll every 100ms | ||
} | ||
|
||
if (all_tasks_cleared_gracefully) { | ||
SERVER_DEBUG("Graceful drain successful: All tasks and queue cleared before deadline."); | ||
} else { // Deadline reached or loop exited for other reasons (though break is only on success) | ||
SERVER_DEBUG("Graceful shutdown deadline (" << graceful_shutdown_wait_seconds.count() << "s) reached or not all tasks cleared. Forcing abort if any remain."); | ||
} | ||
|
||
return 0; | ||
// Ensure shutdown_requested is true (it should be, but this is a safeguard) | ||
// and notify workers again to ensure they pick up the abort signal if they missed the first one | ||
// or if they were in the middle of a long task. | ||
SERVER_DEBUG("Issuing final shutdown signal to ensure all workers stop."); | ||
shutdown_requested.store(true, std::memory_order_release); | ||
task_queue_cv.notify_all(); | ||
// --- END OF NEW DYNAMIC DRAIN LOGIC --- | ||
|
||
bool any_threads_still_active = false; | ||
for (const auto& worker : worker_threads) { | ||
if (worker.joinable()) { | ||
any_threads_still_active = true; | ||
break; | ||
} | ||
} | ||
if (any_threads_still_active && std::chrono::steady_clock::now() >= graceful_shutdown_deadline) { | ||
SERVER_DEBUG("Graceful shutdown period (" << graceful_shutdown_wait_seconds.count() << "s) ended, some workers still active. Setting hard abort flag (shutdown_requested)."); | ||
shutdown_requested.store(true, std::memory_order_release); | ||
task_queue_cv.notify_all(); | ||
} else if (!any_threads_still_active) { | ||
SERVER_DEBUG("All worker threads appear to have completed their tasks gracefully or were already finished."); | ||
} else { | ||
SERVER_DEBUG("Graceful shutdown period not fully elapsed or all threads finished. Proceeding to final join."); | ||
} | ||
SERVER_DEBUG("Waiting for all worker threads to join..."); | ||
for (size_t i = 0; i < worker_threads.size(); ++i) { | ||
if (worker_threads[i].joinable()) { | ||
worker_threads[i].join(); | ||
SERVER_DEBUG("Worker thread " << i << " joined."); | ||
} | ||
} | ||
worker_threads.clear(); | ||
SERVER_DEBUG("All worker threads joined and cleared."); | ||
SERVER_DEBUG("Freeing Whisper contexts..."); | ||
{ | ||
std::lock_guard<std::mutex> lock(contextManagementMutex); | ||
if (!context_pool.empty() && context_pool[0] && context_pool[0].get() != nullptr) { | ||
whisper_print_timings(context_pool[0].get()); | ||
} | ||
context_pool.clear(); | ||
SERVER_DEBUG("Whisper contexts freed."); | ||
} | ||
int exit_code = 0; | ||
if (shutdown_requested.load(std::memory_order_acquire)) { | ||
SERVER_DEBUG("Graceful shutdown complete."); | ||
exit_code = 0; | ||
} else { | ||
if (!server_ran_ok) { | ||
SERVER_DEBUG("Server loop terminated (listen_after_bind returned false) without a prior signal. This indicates an issue or non-signal stop."); | ||
exit_code = 1; | ||
} else { | ||
SERVER_DEBUG("Server loop completed (listen_after_bind returned true) without shutdown request. Unexpected for blocking server."); | ||
exit_code = 0; | ||
} | ||
} | ||
SERVER_DEBUG("Exiting main with code: " << exit_code); | ||
return exit_code; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
100 lines of shutdown process - this can't be right.
thank you for the review @ggerganov I'll cut it in 3-4 PRs |
Enhancements in this update focus on making the server more robust, efficient, and easier to manage:
These changes collectively make the server more resilient under load, easier to tune for different environments, and safer to operate in production.