Mirror of https://github.com/kvcache-ai/ktransformers.git (synced 2025-09-11 15:54:37 +00:00)

Commit 71c1d4eed7 ("smallthink run"), parent 590fcb41cd
7 changed files with 123 additions and 32 deletions

The commit wires SmallThinker (and GLM-4-MoE) support into the balance_serve backend: it retargets the sample config at a local Smallthinker-21B model, reworks the Smallthinker forward path, tightens a freqs_cis guard in attention, and registers scheduler-settings factories for both new architectures. The hunks below are annotated; file names were lost in the page capture, but each hunk header carries its class or function context.
@@ -21,12 +21,12 @@ user:
 model:
   # type: transformers
-  # type: balance_serve
-  type: ktransformers
+  type: balance_serve
+  # type: ktransformers
 
-  name: DeepSeek-Coder-V2-Instruct
-  path: deepseek-ai/DeepSeek-V2-Lite-Chat
-  gguf_path: ./DeepSeek-V2-Lite-Chat-GGUF
+  name: SmallthinkerForCausalLM
+  path: /mnt/data/models/Smallthinker-21B
+  gguf_path: /mnt/data/models/Smallthinker-21B
 
   device: cuda:0
   cache_lens: 16384
@@ -67,7 +67,7 @@ attn:
   page_size: 256
   chunk_size: 256
 kvc2:
-  gpu_only: false
+  gpu_only: true
   utilization_percentage: 1.0
   cpu_memory_size_GB: 500
-  disk_path: /mnt/data/kvc
+  disk_path: /home/wjh/kvc
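The two config hunks above retarget the server's run configuration: the backend type flips from ktransformers to balance_serve, the model entries point at a local Smallthinker-21B checkpoint and its GGUF directory, and kvc2 switches to GPU-only KV caching with a developer-local disk path. A minimal pre-launch sanity check for this section might look as follows (a sketch only; the config.yaml location, the PyYAML dependency, and the required-key list are illustrative assumptions, not part of the commit):

    # Sketch: validate the edited `model` and `kvc2` sections before starting
    # the server. File location and key list are assumptions for illustration.
    import yaml

    with open("config.yaml") as f:
        cfg = yaml.safe_load(f)

    assert cfg["model"]["type"] in {"transformers", "ktransformers", "balance_serve"}
    for key in ("name", "path", "gguf_path", "device", "cache_lens"):
        assert key in cfg["model"], f"missing model.{key}"
    print("kvc2 gpu_only:", cfg["kvc2"]["gpu_only"], "disk_path:", cfg["kvc2"]["disk_path"])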
@@ -83,23 +83,6 @@ class KSmallthinkerForCausalLM(SmallthinkerPreTrainedModel):
         with torch.cuda.stream(current_stream):
             residual = torch.zeros_like(hidden_states)
             for i, decode_layer in enumerate(self.model.layers):
-                if self.model.transfer_map is not None and i in self.model.transfer_map:
-                    prev_stream = torch.cuda.current_stream()
-                    cur_device = self.model.transfer_map[i]
-                    if cur_device not in self.model.stream_device_map:
-                        self.model.stream_device_map[cur_device] = torch.cuda.Stream(cur_device)
-                    torch.cuda.set_device(cur_device)
-                    self.model.stream_device_map[cur_device].wait_stream(prev_stream)
-                    torch.cuda.set_stream(self.model.stream_device_map[cur_device])
-                    hidden_states = hidden_states.to(
-                        self.model.transfer_map[i], non_blocking=True
-                    )
-
-                    batch.minibatch.position_ids = (
-                        batch.minibatch.position_ids.to(self.model.transfer_map[i], non_blocking=True)
-                        if batch.minibatch.position_ids is not None
-                        else None
-                    )
                 router_input = hidden_states.clone()
                 hidden_states, residual = decode_layer.input_layernorm(hidden_states, num_tokens_tensors, residual)
                 hidden_states = decode_layer.self_attn(hidden_states, self.cache,
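This hunk removes the per-layer multi-GPU handoff from the Smallthinker forward loop: the code no longer consults transfer_map to create a stream on the target device, synchronize it with the producer's stream, and migrate hidden_states and position_ids mid-forward. For reference, the stream-handoff idiom being deleted looks like this in isolation (a standalone sketch of the pattern, not ktransformers API; it needs at least two CUDA devices to do anything useful):

    import torch

    def handoff(tensor, target_device):
        # Create a stream on the receiving device, order it after all work
        # already queued on the producer's current stream, then move the
        # activation without blocking the host.
        prev_stream = torch.cuda.current_stream()
        stream = torch.cuda.Stream(target_device)
        stream.wait_stream(prev_stream)
        with torch.cuda.stream(stream):
            return tensor.to(target_device, non_blocking=True)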
@@ -110,9 +93,9 @@ class KSmallthinkerForCausalLM(SmallthinkerPreTrainedModel):
 
                 hidden_states, residual = decode_layer.post_attention_layernorm(hidden_states, num_tokens_tensors, residual)
                 if not self.config.moe_layer_layout[i]:
-                    hidden_states = decode_layer.feed_forward(router_input, hidden_states, num_tokens_tensors)
+                    hidden_states = decode_layer.block_sparse_moe(hidden_states, num_tokens_tensors)
                 else:
-                    hidden_states = decode_layer.feed_forward(hidden_states, num_tokens_tensors, cuda_graph_idx)
+                    hidden_states = decode_layer.block_sparse_moe(router_input, hidden_states, num_tokens_tensors, cuda_graph_idx)
                 # hidden_states = hidden_states.squeeze(0)
 
         forward_batch_output = ForwardBatchOutput()
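The second forward-path hunk replaces both feed_forward calls with block_sparse_moe, and the MoE branch now receives router_input, the clone of the residual stream captured before input_layernorm in the previous hunk. The sketch below shows that pre-norm routing shape with stand-in callables (norm1/attn/norm2/moe are placeholders, and the fused residual bookkeeping of the real layernorm signatures is simplified away; this is an illustration of the pattern, not the repo's code):

    import torch

    def decode_layer_step(hidden_states, norm1, attn, norm2, moe):
        router_input = hidden_states.clone()         # router sees the raw residual stream
        h = norm1(hidden_states)
        hidden_states = hidden_states + attn(h)      # attention plus residual add
        h = norm2(hidden_states)
        return hidden_states + moe(router_input, h)  # experts gated on router_input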
@@ -382,8 +382,7 @@ class Glm4MoeDecoderLayer(GradientCheckpointingLayer):
             past_key_value=past_key_value,
             use_cache=use_cache,
             cache_position=cache_position,
-            position_embeddings=position_embeddings,
-            **kwargs,
+            position_embeddings=position_embeddings
         )
         hidden_states = residual + hidden_states
 
@@ -539,7 +538,6 @@ class Glm4MoeModel(Glm4MoePreTrainedModel):
                 past_key_value=past_key_values,
                 cache_position=cache_position,
                 position_embeddings=position_embeddings,
-                **kwargs,
             )
 
         hidden_states = self.norm(hidden_states)
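Both Glm4Moe hunks stop forwarding **kwargs into the attention call (the decoder-layer hunk also drops the trailing comma after position_embeddings). Whatever the motivation here, the failure mode this sidesteps is easy to demonstrate: a callee that does not accept a stray keyword argument raises immediately. Illustrative only; `attention` and the kwarg name are made up:

    def attention(hidden_states, position_embeddings=None):
        return hidden_states

    kwargs = {"output_attentions": False}  # hypothetical stray kwarg
    try:
        attention(0, position_embeddings=None, **kwargs)
    except TypeError as e:
        print(e)  # attention() got an unexpected keyword argument 'output_attentions'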
@@ -588,7 +588,7 @@ class KSmallthinkerAttention(BaseInjectedModule, SmallthinkerAttention):
         print(cos.shape)
         print(sin.shape)
         """
-        if freqs_cis:
+        if freqs_cis is not None:
             query_states, key_states = self.apply_rotary_pos_emb(query_states.unsqueeze(0), key_states.unsqueeze(0), freqs_cis)
 
 
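The KSmallthinkerAttention change replaces a truthiness test on freqs_cis with an explicit None check. If freqs_cis is a tensor, as the name suggests, the old guard was a latent bug: truth-testing a tensor with more than one element raises at runtime. A minimal reproduction:

    import torch

    freqs_cis = torch.randn(4, 8)   # stand-in rotary table
    try:
        if freqs_cis:               # RuntimeError: Boolean value of Tensor with
            pass                    # more than one element is ambiguous
    except RuntimeError as e:
        print(e)

    if freqs_cis is not None:       # corrected guard: tests presence, not contents
        print("apply rotary embeddings")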
@@ -153,7 +153,7 @@ class ArgumentParser:
             raise ValueError(f"Model {args.model_name} not supported. Please check your model directory or model name.")
 
 
-        if model_config.architectures[0] == "Qwen3MoeForCausalLM" or model_config.architectures[0] == "Qwen2MoeForCausalLM" :
+        if model_config.architectures[0] == "Qwen3MoeForCausalLM" or model_config.architectures[0] == "Qwen2MoeForCausalLM" or model_config.architectures[0] == "SmallthinkerForCausalLM" or model_config.architectures[0] == "Glm4MoeForCausalLM":
             args.gpu_memory_size = args.cache_lens*2*2*model_config.num_hidden_layers*model_config.num_key_value_heads*model_config.head_dim
             args.architectures = model_config.architectures[0]
         else:
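This hunk adds SmallthinkerForCausalLM and Glm4MoeForCausalLM to the architectures whose GPU KV pool is sized as cache_lens × 2 (K and V) × 2 (bytes per BF16 element) × num_hidden_layers × num_key_value_heads × head_dim. A worked example with placeholder model dimensions (illustrative values, not Smallthinker-21B's actual config):

    cache_lens = 16384          # from the config hunk above
    num_hidden_layers = 32      # placeholder
    num_key_value_heads = 8     # placeholder
    head_dim = 128              # matches the k_head_dim used by the scheduler settings

    gpu_memory_size = cache_lens * 2 * 2 * num_hidden_layers * num_key_value_heads * head_dim
    print(f"{gpu_memory_size / 2**30:.1f} GiB")  # 2.0 GiB for these numbers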
@@ -10,7 +10,7 @@ current_file_path = os.path.abspath(__file__)
 # sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
 import pickle
 import argparse
-from ktransformers.server.balance_serve.settings import sched_ext, create_sched_settings, create_sched_settings_qwen2moe, create_sched_settings_qwen3moe
+from ktransformers.server.balance_serve.settings import sched_ext, create_sched_settings, create_sched_settings_qwen2moe, create_sched_settings_qwen3moe, create_sched_settings_glm4moe, create_sched_settings_smallthinker
 
 
 
@@ -213,6 +213,10 @@ if __name__ == '__main__':
         settings = create_sched_settings_qwen2moe(main_args)
     elif main_args.architectures == "Qwen3MoeForCausalLM":
         settings = create_sched_settings_qwen3moe(main_args)
+    elif main_args.architectures == "Glm4MoeForCausalLM":
+        settings = create_sched_settings_glm4moe(main_args)
+    elif main_args.architectures == "SmallthinkerForCausalLM":
+        settings = create_sched_settings_smallthinker(main_args)
     else:
         settings = create_sched_settings(main_args)
     start_server(settings, main_args)
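The server entry point gains two more elif arms to choose a scheduler-settings factory by architecture string. If the supported list keeps growing, a lookup table keeps the dispatch flat; a refactoring sketch only, not code from the commit (it assumes the factory functions imported above are in scope):

    SETTINGS_FACTORIES = {
        "Qwen2MoeForCausalLM": create_sched_settings_qwen2moe,
        "Qwen3MoeForCausalLM": create_sched_settings_qwen3moe,
        "Glm4MoeForCausalLM": create_sched_settings_glm4moe,
        "SmallthinkerForCausalLM": create_sched_settings_smallthinker,
    }

    factory = SETTINGS_FACTORIES.get(main_args.architectures, create_sched_settings)
    settings = factory(main_args)
    start_server(settings, main_args)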
@@ -12,6 +12,8 @@ import sched_ext
 from transformers import AutoConfig
 
 from ktransformers.models.configuration_qwen3_moe import Qwen3MoeConfig
+from ktransformers.models.configuration_glm4_moe import Glm4MoeConfig
+from ktransformers.models.configuration_smallthinker import SmallthinkerConfig
 
 def create_sched_settings(args):
     default_sample_options = sched_ext.SampleOptions()
@@ -172,6 +174,110 @@ def create_sched_settings_qwen3moe(args):
     settings.auto_derive()
     return settings
 
+def create_sched_settings_glm4moe(args):
+    default_sample_options = sched_ext.SampleOptions()
+    model_name = os.path.basename(os.path.normpath(args.model_dir))
+    input_model_settings = sched_ext.ModelSettings()
+    input_model_settings.model_path = args.model_dir
+    input_model_settings.params_count = int(0)
+    model_config = Glm4MoeConfig.from_pretrained(args.model_dir, trust_remote_code=True)
+    input_model_settings.layer_count = model_config.num_hidden_layers
+    input_model_settings.num_k_heads = model_config.num_key_value_heads # model_config["num_key_value_heads"]
+    input_model_settings.k_head_dim = 128
+    input_model_settings.bytes_per_params = 2
+    input_model_settings.bytes_per_kv_cache_element = 2
+    settings = sched_ext.Settings()
+    settings.model_name = model_name
+    settings.quant_type = "BF16"
+    settings.model_settings = input_model_settings
+    settings.page_size = args.page_size
+    settings.gpu_device_count = 1 # tp
+    settings.gpu_device_id = [i for i in range(settings.gpu_device_count)]
+    # settings.gpu_memory_size = args.cache_lens*576*2
+    settings.gpu_memory_size = args.gpu_memory_size
+    settings.memory_utilization_percentage = args.utilization_percentage
+    max_batch_size = args.max_batch_size
+    chunk_size = args.chunk_size
+
+    max_decode_batch_size = max_batch_size - 2
+
+    settings.max_batch_size = max_batch_size
+    settings.recommended_chunk_prefill_token_count = (chunk_size - max_decode_batch_size) // 2
+    settings.sample_options = default_sample_options
+    settings.sched_metrics_port = args.sched_metrics_port
+    settings.gpu_only = args.memory_gpu_only
+    settings.use_self_defined_head_dim = False
+    settings.self_defined_head_dim = 576
+    settings.full_kv_cache_on_each_gpu = True
+    settings.k_cache_on = True
+    settings.v_cache_on = True
+
+    settings.kvc2_root_path = args.kvc2_disk_path
+    settings.kvc2_config_path = args.kvc2_config_dir
+    settings.memory_pool_size_GB = args.cpu_memory_size_GB
+    settings.evict_count = 40
+    settings.kvc2_metrics_port = args.kvc2_metrics_port
+    settings.load_from_disk = False
+    settings.save_to_disk = True
+
+
+    settings.strategy_name = args.sched_strategy
+
+    settings.auto_derive()
+    return settings
+
+def create_sched_settings_smallthinker(args):
+    default_sample_options = sched_ext.SampleOptions()
+    model_name = os.path.basename(os.path.normpath(args.model_dir))
+    input_model_settings = sched_ext.ModelSettings()
+    input_model_settings.model_path = args.model_dir
+    input_model_settings.params_count = int(0)
+    model_config = SmallthinkerConfig.from_pretrained(args.model_dir, trust_remote_code=True)
+    input_model_settings.layer_count = model_config.num_hidden_layers
+    input_model_settings.num_k_heads = model_config.num_key_value_heads # model_config["num_key_value_heads"]
+    input_model_settings.k_head_dim = 128
+    input_model_settings.bytes_per_params = 2
+    input_model_settings.bytes_per_kv_cache_element = 2
+    settings = sched_ext.Settings()
+    settings.model_name = model_name
+    settings.quant_type = "BF16"
+    settings.model_settings = input_model_settings
+    settings.page_size = args.page_size
+    settings.gpu_device_count = 1 # tp
+    settings.gpu_device_id = [i for i in range(settings.gpu_device_count)]
+    # settings.gpu_memory_size = args.cache_lens*576*2
+    settings.gpu_memory_size = args.gpu_memory_size
+    settings.memory_utilization_percentage = args.utilization_percentage
+    max_batch_size = args.max_batch_size
+    chunk_size = args.chunk_size
+
+    max_decode_batch_size = max_batch_size - 2
+
+    settings.max_batch_size = max_batch_size
+    settings.recommended_chunk_prefill_token_count = (chunk_size - max_decode_batch_size) // 2
+    settings.sample_options = default_sample_options
+    settings.sched_metrics_port = args.sched_metrics_port
+    settings.gpu_only = args.memory_gpu_only
+    settings.use_self_defined_head_dim = False
+    settings.self_defined_head_dim = 576
+    settings.full_kv_cache_on_each_gpu = True
+    settings.k_cache_on = True
+    settings.v_cache_on = True
+
+    settings.kvc2_root_path = args.kvc2_disk_path
+    settings.kvc2_config_path = args.kvc2_config_dir
+    settings.memory_pool_size_GB = args.cpu_memory_size_GB
+    settings.evict_count = 40
+    settings.kvc2_metrics_port = args.kvc2_metrics_port
+    settings.load_from_disk = False
+    settings.save_to_disk = True
+
+
+    settings.strategy_name = args.sched_strategy
+
+    settings.auto_derive()
+    return settings
 
 
 
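create_sched_settings_glm4moe and create_sched_settings_smallthinker are line-for-line identical apart from the config class they load; both even keep self_defined_head_dim = 576 while use_self_defined_head_dim is False, so that constant is inert. A parameterized helper would collapse the duplication. Refactoring sketch only, not part of the commit (it assumes sched_ext and the two config classes imported above are in scope):

    def _make_sched_settings(args, config_cls):
        # Shared body: everything in the two factories above except the config class.
        model_config = config_cls.from_pretrained(args.model_dir, trust_remote_code=True)
        input_model_settings = sched_ext.ModelSettings()
        input_model_settings.model_path = args.model_dir
        input_model_settings.params_count = 0
        input_model_settings.layer_count = model_config.num_hidden_layers
        input_model_settings.num_k_heads = model_config.num_key_value_heads
        input_model_settings.k_head_dim = 128
        input_model_settings.bytes_per_params = 2
        input_model_settings.bytes_per_kv_cache_element = 2
        settings = sched_ext.Settings()
        settings.model_settings = input_model_settings
        # ... the remaining assignments mirror the two bodies above verbatim ...
        settings.auto_derive()
        return settings

    def create_sched_settings_glm4moe(args):
        return _make_sched_settings(args, Glm4MoeConfig)

    def create_sched_settings_smallthinker(args):
        return _make_sched_settings(args, SmallthinkerConfig)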