@@ -5257,6 +5257,35 @@ int main(int argc, char ** argv) {
5257
5257
res_ok (res, result->to_json ());
5258
5258
};
5259
5259
5260
+ const auto & do_load_model = [&ctx_server, &params, &state]() -> bool {
5261
+ // load the model
5262
+ LOG_INF (" %s: loading model\n " , __func__);
5263
+
5264
+ if (!ctx_server.load_model (params)) {
5265
+ return false ;
5266
+ }
5267
+
5268
+ ctx_server.init ();
5269
+ state.store (SERVER_STATE_READY);
5270
+
5271
+ LOG_INF (" %s: model loaded\n " , __func__);
5272
+ return true ;
5273
+ };
5274
+
5275
+ #ifdef LLAMA_CPP_SYSTEMD_SUPPORT
5276
+ if (params.use_systemd ) {
5277
+ // When using systemd, load the model before starting to accept on the socket.
5278
+ // This prevents a race condition where the client whose connection triggered
5279
+ // this service's start will get 503 errors while the model loads.
5280
+ if (!do_load_model ()) {
5281
+ LOG_ERR (" %s: exiting due to model loading error\n " , __func__);
5282
+ ctx_server.queue_results .terminate ();
5283
+ llama_backend_free ();
5284
+ return 1 ;
5285
+ }
5286
+ }
5287
+ #endif
5288
+
5260
5289
//
5261
5290
// Router
5262
5291
//
@@ -5444,28 +5473,18 @@ int main(int argc, char ** argv) {
5444
5473
} else {
5445
5474
LOG_INF (" %s: HTTP server is listening on systemd socket, http threads: %d\n " , __func__, params.n_threads_http );
5446
5475
}
5447
- #endif
5448
-
5449
- // load the model
5450
- LOG_INF (" %s: loading model\n " , __func__);
5451
-
5452
- if (!ctx_server.load_model (params)) {
5453
- clean_up ();
5454
- t.join ();
5455
- LOG_ERR (" %s: exiting due to model loading error\n " , __func__);
5456
- return 1 ;
5457
- }
5458
-
5459
- ctx_server.init ();
5460
- state.store (SERVER_STATE_READY);
5461
-
5462
- #ifdef LLAMA_CPP_SYSTEMD_SUPPORT
5463
5476
if (params.use_systemd ) {
5464
5477
sd_notify (0 , " READY=1" );
5465
5478
}
5466
5479
#endif
5467
5480
5468
- LOG_INF (" %s: model loaded\n " , __func__);
5481
+ if (state.load () != SERVER_STATE_READY) {
5482
+ if (!do_load_model ()) {
5483
+ clean_up ();
5484
+ t.join ();
5485
+ return 1 ;
5486
+ }
5487
+ }
5469
5488
5470
5489
// print sample chat example to make it clear which template is used
5471
5490
LOG_INF (" %s: chat template, chat_template: %s, example_format: '%s'\n " , __func__,
0 commit comments