add balance-serve, support concurrency

This commit is contained in:
Atream 2025-03-31 22:55:32 +08:00
parent 8d0292aa44
commit 25cee5810e
196 changed files with 22077 additions and 565 deletions

View file

@ -1,6 +1,6 @@
import argparse
from ktransformers.server.backend.args import ConfigArgs, default_args
from ktransformers.util.utils import get_free_ports
class ArgumentParser:
def __init__(self, cfg):
@ -16,20 +16,18 @@ class ArgumentParser:
parser.add_argument("--web", type=bool, default=self.cfg.mount_web)
parser.add_argument("--model_name", type=str, default=self.cfg.model_name)
parser.add_argument("--model_dir", type=str)
parser.add_argument("--model_path", type=str)
parser.add_argument("--model_path", type=str, default=self.cfg.model_path)
parser.add_argument(
"--device", type=str, default=self.cfg.model_device, help="Warning: Abandoning this parameter"
)
parser.add_argument("--gguf_path", type=str, default=self.cfg.gguf_path)
parser.add_argument("--optimize_config_path", default=self.cfg.optimize_config_path, type=str, required=False)
parser.add_argument("--optimize_config_path", default=None, type=str, required=False)
parser.add_argument("--cpu_infer", type=int, default=self.cfg.cpu_infer)
parser.add_argument("--type", type=str, default=self.cfg.backend_type)
parser.add_argument("--chunk_prefill_size", type=int, default=8192)
parser.add_argument("--backend_type", type=str, default=self.cfg.backend_type)
parser.add_argument("--chunk_size", type=int, default=self.cfg.chunk_size)
# model configs
# parser.add_argument("--model_cache_lens", type=int, default=self.cfg.cache_lens) # int?
parser.add_argument("--paged", type=bool, default=self.cfg.paged)
parser.add_argument("--total_context", type=int, default=self.cfg.total_context)
parser.add_argument("--max_batch_size", type=int, default=self.cfg.max_batch_size)
parser.add_argument("--max_new_tokens", type=int, default=self.cfg.max_new_tokens)
parser.add_argument("--json_mode", type=bool, default=self.cfg.json_mode)
@ -62,7 +60,6 @@ class ArgumentParser:
parser.add_argument("--repetition_penalty", type=float, default=self.cfg.repetition_penalty)
parser.add_argument("--frequency_penalty", type=float, default=self.cfg.frequency_penalty)
parser.add_argument("--presence_penalty", type=float, default=self.cfg.presence_penalty)
parser.add_argument("--max_response_tokens", type=int, default=self.cfg.max_response_tokens)
parser.add_argument("--response_chunk", type=int, default=self.cfg.response_chunk)
parser.add_argument("--no_code_formatting", type=bool, default=self.cfg.no_code_formatting)
parser.add_argument("--cache_8bit", type=bool, default=self.cfg.cache_8bit)
@ -103,6 +100,18 @@ class ArgumentParser:
# local chat
parser.add_argument("--prompt_file", type=str, default=self.cfg.prompt_file)
# async server
parser.add_argument("--sched_strategy", type=str, default=self.cfg.sched_strategy)
# parser.add_argument("--sched_port", type=int, default=self.cfg.sched_port)
# parser.add_argument("--sched_metrics_port", type=int, default=self.cfg.sched_metrics_port)
# parser.add_argument("--kvc2_metrics_port", type=int, default=self.cfg.kvc2_metrics_port)
parser.add_argument("--page_size", type=str, default=self.cfg.page_size)
parser.add_argument("--memory_gpu_only", type=str, default=self.cfg.memory_gpu_only)
parser.add_argument("--utilization_percentage", type=str, default=self.cfg.utilization_percentage)
parser.add_argument("--cpu_memory_size_GB", type=str, default=self.cfg.cpu_memory_size_GB)
args = parser.parse_args()
if (args.model_dir is not None or args.model_path is not None):
if (args.model_path is not None):
@ -123,6 +132,15 @@ class ArgumentParser:
self.cfg.mount_web = args.web
self.cfg.server_ip = args.host
self.cfg.server_port = args.port
self.cfg.backend_type = args.type
self.cfg.user_force_think = args.force_think
args.gpu_memory_size = args.cache_lens*2*576*61
self.cfg.gpu_memory_size = args.gpu_memory_size
free_ports = get_free_ports(3, [args.port])
args.sched_port = free_ports[0]
args.sched_metrics_port = free_ports[1]
args.kvc2_metrics_port = free_ports[2]
self.cfg.sched_port = free_ports[0]
self.cfg.sched_metrics_port = free_ports[1]
self.cfg.kvc2_metrics_port = free_ports[2]
return args