merge main; Add torch q8 linear

2025-09-10 15:29:39 +00:00 · 2025-03-14 05:52:07 -04:00 · 2025-03-14 05:52:07 -04:00 · ed8437413b
commit ed8437413b
parent 6c4ed59175
27 changed files with 1561 additions and 114 deletions
--- a/ktransformers/local_chat.py
+++ b/ktransformers/local_chat.py
@ -31,6 +31,7 @@ from ktransformers.models.modeling_mixtral import MixtralForCausalLM
 from ktransformers.util.utils import prefill_and_generate, get_compute_capability
 from ktransformers.server.config.config import Config
 from ktransformers.operators.flashinfer_wrapper import flashinfer_enabled
+from ktransformers.util.vendors import device_manager, get_device, to_device, GPUVendor

 custom_models = {
    "DeepseekV2ForCausalLM": DeepseekV2ForCausalLM,
@ -169,7 +170,7 @@ def local_chat(
            assert Config().long_context_config['max_seq_len'] > input_tensor.shape[1] + max_new_tokens, \
            "please change max_seq_len in  ~/.ktransformers/config.yaml"
        
-        if system != "Windows" and (config.architectures[0] == "DeepseekV2ForCausalLM" or config.architectures[0] == "DeepseekV3ForCausalLM") and flashinfer_enabled and get_compute_capability() >= 8:
+        if system != "Windows" and (config.architectures[0] == "DeepseekV2ForCausalLM" or config.architectures[0] == "DeepseekV3ForCausalLM") and flashinfer_enabled and get_compute_capability() >= 8 and device_manager.gpu_vendor == GPUVendor.NVIDIA:
            generated = prefill_and_generate(
                model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph, mode = mode, force_think = force_think, chunk_prefill_size = chunk_prefill_size,
                use_flashinfer_mla = True, num_heads = config.num_attention_heads, head_dim_ckv = config.kv_lora_rank, head_dim_kpe = config.qk_rope_head_dim, q_head_dim = config.qk_rope_head_dim + config.qk_nope_head_dim