mirror of
https://github.com/kvcache-ai/ktransformers.git
synced 2025-09-15 09:39:42 +00:00
add balance-serve, support concurrence
This commit is contained in:
parent
8d0292aa44
commit
25cee5810e
196 changed files with 22077 additions and 565 deletions
|
@ -341,7 +341,8 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
|
|||
}, nullptr);
|
||||
}
|
||||
|
||||
void MOE::forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend) {
|
||||
void MOE::forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, int* batch_size_tensor, Backend* backend) {
|
||||
qlen = batch_size_tensor[0];
|
||||
if (qlen < config_.group_min_len) {
|
||||
for (int i = 0; i < qlen; i++) {
|
||||
forward_one(k, expert_ids + i * k, weights + i * k, (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
|
||||
|
@ -350,5 +351,7 @@ void MOE::forward(int qlen, int k, const uint64_t* expert_ids, const float* weig
|
|||
}
|
||||
int forward_len = std::min(config_.group_max_len, qlen);
|
||||
forward_many(forward_len, k, expert_ids, weights, input, output, backend);
|
||||
forward(qlen - forward_len, k, expert_ids + forward_len * k, weights + forward_len * k, (uint8_t*)input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
|
||||
|
||||
batch_size_tensor[0] -= forward_len;
|
||||
forward(qlen - forward_len, k, expert_ids + forward_len * k, weights + forward_len * k, (uint8_t*)input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), batch_size_tensor, backend);
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue