Mirror of https://github.com/kvcache-ai/ktransformers.git, synced 2025-09-09 13:55:27 +00:00
support absorb for prefill long context

commit f4c198bd42
parent e9b1216a9a
8 changed files with 93 additions and 33 deletions
@@ -56,7 +56,7 @@ from ktransformers.models.modeling_deepseek import (
 from transformers.models.qwen2_moe.configuration_qwen2_moe import Qwen2MoeConfig
 from ktransformers.models.configuration_llama import LlamaConfig
 from ktransformers.operators.base_operator import BaseInjectedModule
-from ktransformers.util.utils import InferenceState
+from ktransformers.util.utils import InferenceState, get_compute_capability
 from ktransformers.util.custom_gguf import GGUFLoader
 from transformers.configuration_utils import PretrainedConfig
 from ktransformers.models.modeling_llama import (
@@ -649,7 +649,9 @@ class KDeepseekV2Model(BaseInjectedModule):
         if per_layer_prefill_flag:
             causal_mask = None
         else:
-            if os.name == 'nt':
+            if os.name == 'nt' or get_compute_capability()<8:
+                print("for Windows or GPU before ampere, use forward_windows")
+                # only use mask in forward windows or can't flash attn
                 causal_mask = self._update_causal_mask(
                     attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
                 )
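The new gate depends on get_compute_capability() from ktransformers.util.utils, whose implementation is not part of this diff. A minimal sketch, assuming the helper reports the compute-capability major version of the visible CUDA devices (8 corresponds to Ampere, the first architecture the flash-attention path here requires), could look like:

import torch

def get_compute_capability() -> int:
    # Sketch only; the real helper in ktransformers.util.utils may differ.
    # Returns the smallest compute-capability major version across all
    # visible CUDA devices (e.g. 8 for Ampere, 9 for Hopper), or 0 when
    # CUDA is unavailable, so the caller's "< 8" check falls back to the
    # masked forward_windows path.
    if not torch.cuda.is_available():
        return 0
    return min(
        torch.cuda.get_device_properties(i).major
        for i in range(torch.cuda.device_count())
    )

Taking the minimum across devices is an assumption made in this sketch: it keeps mixed-GPU machines on the safe, masked fallback whenever any card predates Ampere.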