# kvcache-ai-ktransformers/ktransformers/configs/config.yaml

# Logging settings: output directory, log file name, verbosity and backups.
log:
  dir: "logs"
  file: "lexllama.log"
  # Log level; one of: debug, info, warn, error, crit.
  level: "debug"
  # NOTE(review): -1 looks like a sentinel value (e.g. "no limit" or
  # "disabled") - confirm against the logger that reads this key.
  backup_count: -1

# Network endpoint settings for the server.
server:
  # Quoted so the address is unambiguously a string for every YAML loader.
  # 0.0.0.0 is the IPv4 wildcard address.
  ip: "0.0.0.0"
  port: 10002

# Database settings.
db:
  # NOTE(review): spelled "sqllite" (double "l"), not "sqlite". Presumably the
  # consumer matches on this exact string - confirm before "correcting" it.
  type: "sqllite"
  database: "server.db"
  # NOTE(review): presumably the directory that holds the database file
  # (here the current working directory) - confirm against the consumer.
  host: "./"
  pool_size: 10

# User/authentication settings.
# NOTE(review): "HS256" suggests secret_key is used to sign tokens (e.g. JWT)
# - confirm against the consumer.
user:
  # SECURITY: this secret is stored in plain text in the config file, so
  # anyone who can read the file can read the key. Override it per deployment
  # (e.g. from an environment variable or secret store) instead of relying on
  # this value, and rotate it if this file has been shared or committed.
  secret_key: "981f1dd2a44e27d68759d0252a486568ed43480b4e616a26e3af3709c3a7ce73"
  algorithm: "HS256"

# Model selection and generation limits.
model:
  # Backend type. The alternatives seen in this file are kept commented out;
  # exactly one `type` line should be active.
  # type: "transformers"
  # type: "ktransformers"
  type: "balance_serve"

  name: "SmallThinkerForCausalLM"
  # Both paths currently point at the same directory.
  # NOTE(review): these are machine-specific absolute paths - adjust per host.
  path: "/mnt/data/models/Smallthinker-21B"
  gguf_path: "/mnt/data/models/Smallthinker-21B"

  # Quoted because the value contains a colon.
  device: "cuda:0"
  # NOTE(review): presumably the cache length in tokens - confirm.
  cache_lens: 16384
  max_new_tokens: 500
# Web front-end settings.
# Booleans are written in canonical lowercase form (`true`/`false`), matching
# `kvc2.gpu_only` elsewhere in this file; the parsed values are unchanged.
web:
  # NOTE(review): presumably controls whether the web UI is mounted/served
  # - confirm against the consumer.
  mount: false
  # NOTE(review): presumably enables cross-origin (CORS) requests - confirm.
  open_cross_domain: true

# Extension settings.
ext:
  # NOTE(review): presumably the number of CPU threads/workers used for
  # inference - confirm against the consumer.
  cpu_infer: 10

# Long-context settings.
# NOTE(review): the meaning and units of the individual keys are not visible
# in this file - confirm against the consumer before tuning them.
long_context:
  max_seq_len: 32000
  block_size: 128
  local_windows_len: 4096
  second_select_num: 32
  anchor_type: DYNAMIC
  kv_type: FP16
  dense_layer_num: 2
  anchor_num: 1
  # Canonical lowercase boolean; parses to the same value as `True`.
  preselect_block: true
  head_select_mode: SHARED
  preselect_block_count: 32
  layer_step: 1
  # Explicit null instead of a bare empty value: the parsed result is the
  # same (null/None), but the intent "deliberately unset" is now visible.
  token_step: null

# Local chat settings.
local_chat:
  # Empty string by default.
  # NOTE(review): presumably "no prompt file" - confirm how the consumer
  # treats an empty value.
  prompt_file: ""

# Asynchronous server / scheduler settings.
async_server:
  # NOTE(review): "FCFS" presumably means first-come, first-served - confirm
  # which other strategies the scheduler accepts.
  sched_strategy: "FCFS"
  # Three distinct ports: scheduler, scheduler metrics, and kvc2 metrics.
  # NOTE(review): they presumably must all be free on the host - confirm.
  sched_port: 56441
  sched_metrics_port: 54321
  kvc2_metrics_port: 54391
  max_batch_size: 4  # decode count + prefill count, in one mini batch

# Attention settings.
# NOTE(review): both values are 256 here; whether they must stay equal (and
# their units, presumably tokens) is not visible in this file - confirm.
attn:
  page_size: 256
  chunk_size: 256
# kvc2 settings.
kvc2:
  # NOTE(review): with gpu_only enabled, the CPU memory size and disk path
  # below are presumably unused - confirm against the consumer.
  gpu_only: true
  # NOTE(review): 1.0 is presumably a fraction (i.e. 100%) rather than a
  # percentage value, despite the key name - confirm.
  utilization_percentage: 1.0
  cpu_memory_size_GB: 500
  # NOTE(review): user-specific absolute path - adjust per host.
  disk_path: /home/wjh/kvc