smallthink run

qiyuxinlin 2025-07-24 15:08:29 +00:00
parent 590fcb41cd
commit 71c1d4eed7
7 changed files with 123 additions and 32 deletions

@@ -21,12 +21,12 @@ user:
 model:
   # type: transformers
-  # type: balance_serve
-  type: ktransformers
+  type: balance_serve
+  # type: ktransformers
-  name: DeepSeek-Coder-V2-Instruct
-  path: deepseek-ai/DeepSeek-V2-Lite-Chat
-  gguf_path: ./DeepSeek-V2-Lite-Chat-GGUF
+  name: SmallthinkerForCausalLM
+  path: /mnt/data/models/Smallthinker-21B
+  gguf_path: /mnt/data/models/Smallthinker-21B
   device: cuda:0
   cache_lens: 16384
@@ -67,7 +67,7 @@ attn:
   page_size: 256
   chunk_size: 256
 kvc2:
-  gpu_only: false
+  gpu_only: true
   utilization_percentage: 1.0
   cpu_memory_size_GB: 500
-  disk_path: /mnt/data/kvc
+  disk_path: /home/wjh/kvc

@@ -83,23 +83,6 @@ class KSmallthinkerForCausalLM(SmallthinkerPreTrainedModel):
         with torch.cuda.stream(current_stream):
             residual = torch.zeros_like(hidden_states)
             for i, decode_layer in enumerate(self.model.layers):
-                if self.model.transfer_map is not None and i in self.model.transfer_map:
-                    prev_stream = torch.cuda.current_stream()
-                    cur_device = self.model.transfer_map[i]
-                    if cur_device not in self.model.stream_device_map:
-                        self.model.stream_device_map[cur_device] = torch.cuda.Stream(cur_device)
-                    torch.cuda.set_device(cur_device)
-                    self.model.stream_device_map[cur_device].wait_stream(prev_stream)
-                    torch.cuda.set_stream(self.model.stream_device_map[cur_device])
-                    hidden_states = hidden_states.to(
-                        self.model.transfer_map[i], non_blocking=True
-                    )
-                    batch.minibatch.position_ids = (
-                        batch.minibatch.position_ids.to(self.model.transfer_map[i], non_blocking=True)
-                        if batch.minibatch.position_ids is not None
-                        else None
-                    )
                 router_input = hidden_states.clone()
                 hidden_states, residual = decode_layer.input_layernorm(hidden_states, num_tokens_tensors, residual)
                 hidden_states = decode_layer.self_attn(hidden_states, self.cache,
@@ -110,9 +93,9 @@ class KSmallthinkerForCausalLM(SmallthinkerPreTrainedModel):
                 hidden_states, residual = decode_layer.post_attention_layernorm(hidden_states, num_tokens_tensors, residual)
                 if not self.config.moe_layer_layout[i]:
-                    hidden_states = decode_layer.feed_forward(router_input, hidden_states, num_tokens_tensors)
+                    hidden_states = decode_layer.block_sparse_moe(hidden_states, num_tokens_tensors)
                 else:
-                    hidden_states = decode_layer.feed_forward(hidden_states, num_tokens_tensors, cuda_graph_idx)
+                    hidden_states = decode_layer.block_sparse_moe(router_input, hidden_states, num_tokens_tensors, cuda_graph_idx)
             # hidden_states = hidden_states.squeeze(0)
             forward_batch_output = ForwardBatchOutput()
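
The replaced calls above swap the dense feed_forward path for block_sparse_moe, with the router fed router_input, the hidden states cloned before input_layernorm. In other words, the expert router scores tokens on the pre-norm residual stream while the experts consume the post-norm stream. A minimal sketch of that routing pattern, with a hypothetical router/experts module standing in for the ktransformers internals:

    import torch
    import torch.nn as nn

    class BlockSparseMoESketch(nn.Module):
        # Hypothetical stand-in: routes on pre-norm activations, computes on post-norm ones.
        def __init__(self, hidden_size: int, num_experts: int, top_k: int = 2):
            super().__init__()
            self.router = nn.Linear(hidden_size, num_experts, bias=False)
            self.experts = nn.ModuleList(nn.Linear(hidden_size, hidden_size) for _ in range(num_experts))
            self.top_k = top_k

        def forward(self, router_input: torch.Tensor, hidden_states: torch.Tensor) -> torch.Tensor:
            scores = self.router(router_input).softmax(dim=-1)   # router sees the pre-layernorm stream
            weights, expert_idx = scores.topk(self.top_k, dim=-1)
            out = torch.zeros_like(hidden_states)
            for k in range(self.top_k):
                for e, expert in enumerate(self.experts):
                    mask = expert_idx[:, k] == e
                    if mask.any():                               # experts see the post-layernorm stream
                        out[mask] += weights[mask, k].unsqueeze(-1) * expert(hidden_states[mask])
            return out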

@@ -382,8 +382,7 @@ class Glm4MoeDecoderLayer(GradientCheckpointingLayer):
             past_key_value=past_key_value,
             use_cache=use_cache,
             cache_position=cache_position,
-            position_embeddings=position_embeddings,
-            **kwargs,
+            position_embeddings=position_embeddings
         )
         hidden_states = residual + hidden_states
@@ -539,7 +538,6 @@ class Glm4MoeModel(Glm4MoePreTrainedModel):
             past_key_value=past_key_values,
             cache_position=cache_position,
             position_embeddings=position_embeddings,
-            **kwargs,
         )
         hidden_states = self.norm(hidden_states)

@@ -588,7 +588,7 @@ class KSmallthinkerAttention(BaseInjectedModule, SmallthinkerAttention):
         print(cos.shape)
         print(sin.shape)
         """
-        if freqs_cis:
+        if freqs_cis is not None:
            query_states, key_states = self.apply_rotary_pos_emb(query_states.unsqueeze(0), key_states.unsqueeze(0), freqs_cis)
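
The one-line change above is a genuine bugfix rather than a style tweak: evaluating a multi-element PyTorch tensor in boolean context raises a RuntimeError, so "if freqs_cis:" crashes (or, for a one-element tensor, silently tests the value) where a presence check is intended. A quick demonstration:

    import torch

    freqs_cis = torch.randn(4, 8)

    try:
        if freqs_cis:  # boolean value of a multi-element tensor is ambiguous
            pass
    except RuntimeError as err:
        print(err)

    if freqs_cis is not None:  # the intended presence check
        print("apply rotary position embeddings")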

@@ -153,7 +153,7 @@ class ArgumentParser:
             raise ValueError(f"Model {args.model_name} not supported. Please check your model directory or model name.")
-        if model_config.architectures[0] == "Qwen3MoeForCausalLM" or model_config.architectures[0] == "Qwen2MoeForCausalLM" :
+        if model_config.architectures[0] == "Qwen3MoeForCausalLM" or model_config.architectures[0] == "Qwen2MoeForCausalLM" or model_config.architectures[0] == "SmallthinkerForCausalLM" or model_config.architectures[0] == "Glm4MoeForCausalLM":
             args.gpu_memory_size = args.cache_lens*2*2*model_config.num_hidden_layers*model_config.num_key_value_heads*model_config.head_dim
             args.architectures = model_config.architectures[0]
         else:
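
The gpu_memory_size expression in this hunk sizes the KV cache in bytes: cache length x 2 (one K and one V entry per token) x 2 bytes per fp16/bf16 element x layers x KV heads x head dim. A worked example with hypothetical layer/head counts (in practice those values come from the model's config, as in the code above):

    cache_lens = 16384           # from the YAML config in this commit
    kv_pair = 2                  # one K and one V entry per token
    bytes_per_elem = 2           # fp16/bf16
    num_hidden_layers = 48       # hypothetical value for illustration
    num_key_value_heads = 4      # hypothetical
    head_dim = 128               # matches the k_head_dim hard-coded below

    gpu_memory_size = (cache_lens * kv_pair * bytes_per_elem
                       * num_hidden_layers * num_key_value_heads * head_dim)
    print(f"{gpu_memory_size / 2**30:.2f} GiB")  # 1.50 GiB for these numbers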

@@ -10,7 +10,7 @@ current_file_path = os.path.abspath(__file__)
 # sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
 import pickle
 import argparse
-from ktransformers.server.balance_serve.settings import sched_ext, create_sched_settings, create_sched_settings_qwen2moe, create_sched_settings_qwen3moe
+from ktransformers.server.balance_serve.settings import sched_ext, create_sched_settings, create_sched_settings_qwen2moe, create_sched_settings_qwen3moe, create_sched_settings_glm4moe, create_sched_settings_smallthinker
@@ -213,6 +213,10 @@ if __name__ == '__main__':
         settings = create_sched_settings_qwen2moe(main_args)
     elif main_args.architectures == "Qwen3MoeForCausalLM":
         settings = create_sched_settings_qwen3moe(main_args)
+    elif main_args.architectures == "Glm4MoeForCausalLM":
+        settings = create_sched_settings_glm4moe(main_args)
+    elif main_args.architectures == "SmallthinkerForCausalLM":
+        settings = create_sched_settings_smallthinker(main_args)
     else:
         settings = create_sched_settings(main_args)
     start_server(settings, main_args)
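
The elif chain above now covers four architectures and will keep growing as models are added. A table-driven dispatch is one possible tidy-up (a sketch against the imports already in this file, not the committed code):

    SCHED_SETTINGS_FACTORIES = {
        "Qwen2MoeForCausalLM": create_sched_settings_qwen2moe,
        "Qwen3MoeForCausalLM": create_sched_settings_qwen3moe,
        "Glm4MoeForCausalLM": create_sched_settings_glm4moe,
        "SmallthinkerForCausalLM": create_sched_settings_smallthinker,
    }

    factory = SCHED_SETTINGS_FACTORIES.get(main_args.architectures, create_sched_settings)
    settings = factory(main_args)
    start_server(settings, main_args)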

@@ -12,6 +12,8 @@ import sched_ext
 from transformers import AutoConfig
 from ktransformers.models.configuration_qwen3_moe import Qwen3MoeConfig
+from ktransformers.models.configuration_glm4_moe import Glm4MoeConfig
+from ktransformers.models.configuration_smallthinker import SmallthinkerConfig

 def create_sched_settings(args):
     default_sample_options = sched_ext.SampleOptions()
@@ -172,6 +174,110 @@ def create_sched_settings_qwen3moe(args):
     settings.auto_derive()
     return settings

+def create_sched_settings_glm4moe(args):
+    default_sample_options = sched_ext.SampleOptions()
+    model_name = os.path.basename(os.path.normpath(args.model_dir))
+    input_model_settings = sched_ext.ModelSettings()
+    input_model_settings.model_path = args.model_dir
+    input_model_settings.params_count = int(0)
+    model_config = Glm4MoeConfig.from_pretrained(args.model_dir, trust_remote_code=True)
+    input_model_settings.layer_count = model_config.num_hidden_layers
+    input_model_settings.num_k_heads = model_config.num_key_value_heads  # model_config["num_key_value_heads"]
+    input_model_settings.k_head_dim = 128
+    input_model_settings.bytes_per_params = 2
+    input_model_settings.bytes_per_kv_cache_element = 2
+
+    settings = sched_ext.Settings()
+    settings.model_name = model_name
+    settings.quant_type = "BF16"
+    settings.model_settings = input_model_settings
+    settings.page_size = args.page_size
+    settings.gpu_device_count = 1  # tp
+    settings.gpu_device_id = [i for i in range(settings.gpu_device_count)]
+    # settings.gpu_memory_size = args.cache_lens*576*2
+    settings.gpu_memory_size = args.gpu_memory_size
+    settings.memory_utilization_percentage = args.utilization_percentage
+    max_batch_size = args.max_batch_size
+    chunk_size = args.chunk_size
+    max_decode_batch_size = max_batch_size - 2
+    settings.max_batch_size = max_batch_size
+    settings.recommended_chunk_prefill_token_count = (chunk_size - max_decode_batch_size) // 2
+    settings.sample_options = default_sample_options
+    settings.sched_metrics_port = args.sched_metrics_port
+    settings.gpu_only = args.memory_gpu_only
+    settings.use_self_defined_head_dim = False
+    settings.self_defined_head_dim = 576
+    settings.full_kv_cache_on_each_gpu = True
+    settings.k_cache_on = True
+    settings.v_cache_on = True
+    settings.kvc2_root_path = args.kvc2_disk_path
+    settings.kvc2_config_path = args.kvc2_config_dir
+    settings.memory_pool_size_GB = args.cpu_memory_size_GB
+    settings.evict_count = 40
+    settings.kvc2_metrics_port = args.kvc2_metrics_port
+    settings.load_from_disk = False
+    settings.save_to_disk = True
+    settings.strategy_name = args.sched_strategy
+    settings.auto_derive()
+    return settings
+
+def create_sched_settings_smallthinker(args):
+    default_sample_options = sched_ext.SampleOptions()
+    model_name = os.path.basename(os.path.normpath(args.model_dir))
+    input_model_settings = sched_ext.ModelSettings()
+    input_model_settings.model_path = args.model_dir
+    input_model_settings.params_count = int(0)
+    model_config = SmallthinkerConfig.from_pretrained(args.model_dir, trust_remote_code=True)
+    input_model_settings.layer_count = model_config.num_hidden_layers
+    input_model_settings.num_k_heads = model_config.num_key_value_heads  # model_config["num_key_value_heads"]
+    input_model_settings.k_head_dim = 128
+    input_model_settings.bytes_per_params = 2
+    input_model_settings.bytes_per_kv_cache_element = 2
+
+    settings = sched_ext.Settings()
+    settings.model_name = model_name
+    settings.quant_type = "BF16"
+    settings.model_settings = input_model_settings
+    settings.page_size = args.page_size
+    settings.gpu_device_count = 1  # tp
+    settings.gpu_device_id = [i for i in range(settings.gpu_device_count)]
+    # settings.gpu_memory_size = args.cache_lens*576*2
+    settings.gpu_memory_size = args.gpu_memory_size
+    settings.memory_utilization_percentage = args.utilization_percentage
+    max_batch_size = args.max_batch_size
+    chunk_size = args.chunk_size
+    max_decode_batch_size = max_batch_size - 2
+    settings.max_batch_size = max_batch_size
+    settings.recommended_chunk_prefill_token_count = (chunk_size - max_decode_batch_size) // 2
+    settings.sample_options = default_sample_options
+    settings.sched_metrics_port = args.sched_metrics_port
+    settings.gpu_only = args.memory_gpu_only
+    settings.use_self_defined_head_dim = False
+    settings.self_defined_head_dim = 576
+    settings.full_kv_cache_on_each_gpu = True
+    settings.k_cache_on = True
+    settings.v_cache_on = True
+    settings.kvc2_root_path = args.kvc2_disk_path
+    settings.kvc2_config_path = args.kvc2_config_dir
+    settings.memory_pool_size_GB = args.cpu_memory_size_GB
+    settings.evict_count = 40
+    settings.kvc2_metrics_port = args.kvc2_metrics_port
+    settings.load_from_disk = False
+    settings.save_to_disk = True
+    settings.strategy_name = args.sched_strategy
+    settings.auto_derive()
+    return settings
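
create_sched_settings_glm4moe and create_sched_settings_smallthinker are line-for-line identical except for the config class they load, and both duplicate most of the qwen variants as well. If the copies ever drift, a single parameterized helper would keep them in sync (a sketch reusing the sched_ext API exactly as called above; the helper name is hypothetical):

    def _create_sched_settings_for(args, config_cls):
        # Shared body of the per-architecture builders; config_cls selects the model family.
        model_config = config_cls.from_pretrained(args.model_dir, trust_remote_code=True)
        input_model_settings = sched_ext.ModelSettings()
        input_model_settings.model_path = args.model_dir
        input_model_settings.params_count = 0
        input_model_settings.layer_count = model_config.num_hidden_layers
        input_model_settings.num_k_heads = model_config.num_key_value_heads
        input_model_settings.k_head_dim = 128
        input_model_settings.bytes_per_params = 2
        input_model_settings.bytes_per_kv_cache_element = 2

        settings = sched_ext.Settings()
        settings.model_name = os.path.basename(os.path.normpath(args.model_dir))
        settings.quant_type = "BF16"
        settings.model_settings = input_model_settings
        # ... remaining fields exactly as in the two functions above ...
        settings.auto_derive()
        return settings

    def create_sched_settings_glm4moe(args):
        return _create_sched_settings_for(args, Glm4MoeConfig)

    def create_sched_settings_smallthinker(args):
        return _create_sched_settings_for(args, SmallthinkerConfig)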