diff --git a/common/common.h b/common/common.h index 8a0e5eed5..9855d3f36 100644 --- a/common/common.h +++ b/common/common.h @@ -587,7 +587,7 @@ struct common_params { // server params int32_t port = 8080; // server listens on this network port bool reuse_port = false; // allow multiple sockets to bind to the same port - int32_t timeout_read = 600; // http read timeout in seconds + int32_t timeout_read = 3600; // http read timeout in seconds int32_t timeout_write = timeout_read; // http write timeout in seconds int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting diff --git a/tools/server/server-queue.cpp b/tools/server/server-queue.cpp index d5fceb1b1..588e1a82b 100644 --- a/tools/server/server-queue.cpp +++ b/tools/server/server-queue.cpp @@ -381,8 +381,10 @@ server_task_result_ptr server_response_reader::next(const std::function if (result == nullptr) { // timeout, check stop condition if (should_stop()) { - SRV_WRN("%s", "stopping wait for next result due to should_stop condition (adjust the --timeout argument if needed)\n"); - SRV_WRN("%s", "ref: https://github.com/ggml-org/llama.cpp/pull/22907\n"); + const int64_t time_elapsed_ms = ggml_time_ms() - time_start_ms; + if (time_elapsed_ms > 30000) { + SRV_WRN("%s", "request cancelled after 30s, potentially a client-side timeout; please check your client's code\n"); + } return nullptr; } } else { diff --git a/tools/server/server-queue.h b/tools/server/server-queue.h index 35f010401..8ce32c69f 100644 --- a/tools/server/server-queue.h +++ b/tools/server/server-queue.h @@ -169,6 +169,8 @@ struct server_response_reader { bool cancelled = false; int polling_interval_seconds; + const int64_t time_start_ms = ggml_time_ms(); + // tracking generation state and partial tool calls // only used by streaming completions std::vector states;