Implement multi-batch support for v2, v3, and r1 models with backend_type configured as ktransformers.

This commit is contained in:
jiafei96 2025-07-09 09:09:47 +00:00
parent 890b0f1622
commit a6ab9e349c
6 changed files with 383 additions and 52 deletions

View file

@@ -670,6 +670,7 @@ class KLinearMarlin(KLinearBase):
padding_input[:,:self.orin_in_features] = x
x = padding_input
marlin_s = self.marlin_s.to(x.dtype)
x = x.contiguous()
x = KTransformersOps.gptq_marlin_gemm(
x,
self.marlin_q_w,