Skip to content

Commit 2ea6d58

Browse files
committed
When using systemd, load the model before accepting connections.
This avoids a race condition where the client whose connection triggered the service's start would get 503 errors while the model loads.
1 parent 7cca958 commit 2ea6d58

File tree

1 file changed

+36
-17
lines changed

1 file changed

+36
-17
lines changed

tools/server/server.cpp

Lines changed: 36 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -5257,6 +5257,35 @@ int main(int argc, char ** argv) {
52575257
res_ok(res, result->to_json());
52585258
};
52595259

5260+
const auto & do_load_model = [&ctx_server, &params, &state]() -> bool {
5261+
// load the model
5262+
LOG_INF("%s: loading model\n", __func__);
5263+
5264+
if (!ctx_server.load_model(params)) {
5265+
return false;
5266+
}
5267+
5268+
ctx_server.init();
5269+
state.store(SERVER_STATE_READY);
5270+
5271+
LOG_INF("%s: model loaded\n", __func__);
5272+
return true;
5273+
};
5274+
5275+
#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
5276+
if (params.use_systemd) {
5277+
// When using systemd, load the model before starting to accept on the socket.
5278+
// This prevents a race condition where the client whose connection triggered
5279+
// this service's start will get 503 errors while the model loads.
5280+
if (!do_load_model()) {
5281+
LOG_ERR("%s: exiting due to model loading error\n", __func__);
5282+
ctx_server.queue_results.terminate();
5283+
llama_backend_free();
5284+
return 1;
5285+
}
5286+
}
5287+
#endif
5288+
52605289
//
52615290
// Router
52625291
//
@@ -5444,28 +5473,18 @@ int main(int argc, char ** argv) {
54445473
} else {
54455474
LOG_INF("%s: HTTP server is listening on systemd socket, http threads: %d\n", __func__, params.n_threads_http);
54465475
}
5447-
#endif
5448-
5449-
// load the model
5450-
LOG_INF("%s: loading model\n", __func__);
5451-
5452-
if (!ctx_server.load_model(params)) {
5453-
clean_up();
5454-
t.join();
5455-
LOG_ERR("%s: exiting due to model loading error\n", __func__);
5456-
return 1;
5457-
}
5458-
5459-
ctx_server.init();
5460-
state.store(SERVER_STATE_READY);
5461-
5462-
#ifdef LLAMA_CPP_SYSTEMD_SUPPORT
54635476
if (params.use_systemd) {
54645477
sd_notify(0, "READY=1");
54655478
}
54665479
#endif
54675480

5468-
LOG_INF("%s: model loaded\n", __func__);
5481+
if (state.load() != SERVER_STATE_READY) {
5482+
if (!do_load_model()) {
5483+
clean_up();
5484+
t.join();
5485+
return 1;
5486+
}
5487+
}
54695488

54705489
// print sample chat example to make it clear which template is used
54715490
LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,

0 commit comments

Comments
 (0)