Mirror of https://github.com/kvcache-ai/ktransformers.git, synced 2025-09-12 00:04:39 +00:00

smallthinker right

Commit 712ad1fa3c (parent f8719ee7b9)
7 changed files with 48 additions and 108 deletions
@@ -473,17 +473,31 @@ class KSmallthinkerAttention(BaseInjectedModule, SmallthinkerAttention):
                                   orig_module.layer_idx)
         self.chunck_size = chunck_size # TODO, generate chunck_size automatically.
 
-    def apply_rotary_pos_emb(
-        self,
-        xq: torch.Tensor,
-        xk: torch.Tensor,
-        freqs_cis: torch.Tensor,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
-        xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
-        xq_out = torch.view_as_real(xq_ * freqs_cis[:, :, None, :]).flatten(3)
-        xk_out = torch.view_as_real(xk_ * freqs_cis[:, :, None, :]).flatten(3)
-        return xq_out.type_as(xq), xk_out.type_as(xk)
+    def apply_rotary_pos_emb(self, q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+        """Applies Rotary Position Embedding to the query and key tensors.
+
+        Args:
+            q (`torch.Tensor`): The query tensor.
+            k (`torch.Tensor`): The key tensor.
+            cos (`torch.Tensor`): The cosine part of the rotary embedding.
+            sin (`torch.Tensor`): The sine part of the rotary embedding.
+            position_ids (`torch.Tensor`, *optional*):
+                Deprecated and unused.
+            unsqueeze_dim (`int`, *optional*, defaults to 1):
+                The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+                sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+                that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+                k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+                cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+                the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+        Returns:
+            `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+        """
+        cos = cos.unsqueeze(unsqueeze_dim)
+        sin = sin.unsqueeze(unsqueeze_dim)
+        q_embed = (q * cos) + (rotate_half(q) * sin)
+        k_embed = (k * cos) + (rotate_half(k) * sin)
+        return q_embed, k_embed
 
     def forward(self,
                 hidden_states: torch.Tensor,
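The cos/sin overload added here leans on a rotate_half helper that does not appear in this hunk; in Hugging Face-style RoPE code it rotates the two halves of the last dimension. A minimal sketch of that helper, assuming this file follows the same convention (the helper itself is not shown in the diff):

import torch

def rotate_half(x: torch.Tensor) -> torch.Tensor:
    # Split the last dimension into two halves and rotate them: (x1, x2) -> (-x2, x1).
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

Note that this pairs feature i with feature i + head_dim // 2, while the complex-valued variant above pairs adjacent features (2i, 2i + 1) via view_as_complex; the two layouts agree only if the cos/sin tables and the weight layout follow the matching convention.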
@@ -514,7 +528,8 @@ class KSmallthinkerAttention(BaseInjectedModule, SmallthinkerAttention):
         print(sin.shape)
         """
         if freqs_cis:
-            query_states, key_states = self.apply_rotary_pos_emb(query_states.unsqueeze(0), key_states.unsqueeze(0), freqs_cis)
+            cos, sin = freqs_cis
+            query_states, key_states = self.apply_rotary_pos_emb(query_states.unsqueeze(0), key_states.unsqueeze(0), cos, sin, unsqueeze_dim=2)
 
 
 
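At this call site the projections are laid out as [q_len, num_heads, head_dim] with no batch dimension, so unsqueeze(0) yields [1, q_len, heads, head_dim]; assuming cos and sin arrive as [batch, seq, head_dim], unsqueeze_dim=2 is what makes them broadcast across the head axis. A small shape check under those assumptions (sizes are illustrative, not taken from the model config):

import torch

q_len, n_heads, n_kv_heads, head_dim = 5, 8, 2, 64   # illustrative sizes only

q = torch.randn(q_len, n_heads, head_dim).unsqueeze(0)     # [1, q_len, n_heads, head_dim]
k = torch.randn(q_len, n_kv_heads, head_dim).unsqueeze(0)  # [1, q_len, n_kv_heads, head_dim]
cos = torch.randn(1, q_len, head_dim)                      # assumed [batch, seq, head_dim] layout
sin = torch.randn(1, q_len, head_dim)

# unsqueeze_dim=2 inserts a singleton head axis: [1, q_len, 1, head_dim].
assert (q * cos.unsqueeze(2)).shape == q.shape
assert (k * sin.unsqueeze(2)).shape == k.shape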
@@ -533,80 +548,7 @@ class KSmallthinkerAttention(BaseInjectedModule, SmallthinkerAttention):
 
         return attn_output
 
-class KSmallthinkerAttention(BaseInjectedModule, SmallthinkerAttention):
-    def __init__(self,
-                 key: str,
-                 gguf_loader : GGUFLoader,
-                 config: PretrainedConfig,
-                 orig_module: nn.Module,
-                 prefill_device: str = "cuda",
-                 generate_device: str = "cuda",
-                 chunck_size: int = 1000,
-                 **kwargs):
-        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, prefill_device, **kwargs)
-        self.orig_module.__init__(orig_module.config,
-                                  orig_module.layer_idx)
-        self.chunck_size = chunck_size # TODO, generate chunck_size automatically.
-
-    def apply_rotary_pos_emb(
-        self,
-        xq: torch.Tensor,
-        xk: torch.Tensor,
-        freqs_cis: torch.Tensor,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
-        xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
-        xq_out = torch.view_as_real(xq_ * freqs_cis[:, :, None, :]).flatten(3)
-        xk_out = torch.view_as_real(xk_ * freqs_cis[:, :, None, :]).flatten(3)
-        return xq_out.type_as(xq), xk_out.type_as(xk)
-
-    def forward(self,
-                hidden_states: torch.Tensor,
-                kv_cache: KGQACache,
-                freqs_cis: torch.Tensor,
-                wrapper: flashInferAttn,
-                bsz_tensors: torch.Tensor,
-                position_ids: torch.Tensor = None,
-                ):
-
-        if self.use_qk_norm:
-            raise NotImplementedError("use_qk_norm is not implemented yet")
-
-        q_len, _ = hidden_states.size()
-        query_states = self.q_proj(hidden_states, bsz_tensors)
-        key_states = self.k_proj(hidden_states, bsz_tensors)
-        value_states = self.v_proj(hidden_states, bsz_tensors)
-
-        query_states = query_states.view(q_len, self.num_attention_heads, self.head_dim)
-        key_states = key_states.view(q_len, self.num_key_value_heads, self.head_dim)
-        value_states = value_states.view(q_len, self.num_key_value_heads, self.head_dim)
-
-        # cos, sin = freqs_cis
-        """
-        print(query_states.shape)
-        print(key_states.shape)
-        print(cos.shape)
-        print(sin.shape)
-        """
-        if freqs_cis is not None:
-            query_states, key_states = self.apply_rotary_pos_emb(query_states.unsqueeze(0), key_states.unsqueeze(0), freqs_cis)
-
-
-
-        query_states = query_states.view(q_len, self.num_attention_heads, self.head_dim)
-        key_states = key_states.view(q_len, self.num_key_value_heads, self.head_dim)
-        value_states = value_states.view(q_len, self.num_key_value_heads, self.head_dim)
-
-        k_cache = kv_cache.get_k_cache(self.layer_idx)
-        v_cache = kv_cache.get_v_cache(self.layer_idx)
-
-
-        attn_output = wrapper.forward(query_states, k_cache, v_cache, key_states, value_states)
-
-
-        attn_output = self.o_proj(attn_output.view(q_len, self.num_attention_heads * self.head_dim), bsz_tensors)
-
-        return attn_output
-
 
 class KGlm4MoeAttention(BaseInjectedModule, Glm4MoeAttention):
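For reference, the complex-valued apply_rotary_pos_emb variant that appears in this diff consumes a complex freqs_cis tensor in the Llama style, typically precomputed with torch.polar. A minimal sketch of such a precompute step, under the assumption that this codebase follows that convention (the helper below is not part of the diff):

import torch

def precompute_freqs_cis(head_dim: int, max_seq_len: int, theta: float = 10000.0) -> torch.Tensor:
    # One complex rotation per (position, feature pair): shape [max_seq_len, head_dim // 2].
    inv_freq = 1.0 / (theta ** (torch.arange(0, head_dim, 2).float() / head_dim))
    angles = torch.outer(torch.arange(max_seq_len).float(), inv_freq)
    return torch.polar(torch.ones_like(angles), angles)  # complex64

That variant indexes the table as freqs_cis[:, :, None, :], so the slice handed to it would need a leading batch dimension, e.g. freqs_cis[position_ids][None] with shape [1, q_len, head_dim // 2], matching the unsqueeze(0) applied to the query and key states.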