kvcache-ai-ktransformers/en/SFT/injection_tutorial.html
2025-12-16 07:20:29 +00:00

551 lines
42 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!DOCTYPE HTML>
<html lang="zh-CN" class="light sidebar-visible" dir="ltr">
<head>
<!-- Book generated using mdBook -->
<meta charset="UTF-8">
<title>Injection Tutorial - Ktransformers</title>
<!-- Custom HTML head -->
<meta name="description" content="">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="theme-color" content="#ffffff">
<link rel="icon" href="../../favicon-de23e50b.svg">
<link rel="shortcut icon" href="../../favicon-8114d1fc.png">
<link rel="stylesheet" href="../../css/variables-8adf115d.css">
<link rel="stylesheet" href="../../css/general-2459343d.css">
<link rel="stylesheet" href="../../css/chrome-ae938929.css">
<link rel="stylesheet" href="../../css/print-9e4910d8.css" media="print">
<!-- Fonts -->
<link rel="stylesheet" href="../../fonts/fonts-9644e21d.css">
<!-- Highlight.js Stylesheets -->
<link rel="stylesheet" id="mdbook-highlight-css" href="../../highlight-493f70e1.css">
<link rel="stylesheet" id="mdbook-tomorrow-night-css" href="../../tomorrow-night-4c0ae647.css">
<link rel="stylesheet" id="mdbook-ayu-highlight-css" href="../../ayu-highlight-3fdfc3ac.css">
<!-- Custom theme stylesheets -->
<!-- Provide site root and default themes to javascript -->
<script>
const path_to_root = "../../";
const default_light_theme = "light";
const default_dark_theme = "navy";
window.path_to_searchindex_js = "../../searchindex-60c07ede.js";
</script>
<!-- Start loading toc.js asap -->
<script src="../../toc-78fe5878.js"></script>
</head>
<body>
<div id="mdbook-help-container">
<div id="mdbook-help-popup">
<h2 class="mdbook-help-title">Keyboard shortcuts</h2>
<div>
<p>Press <kbd></kbd> or <kbd></kbd> to navigate between chapters</p>
<p>Press <kbd>S</kbd> or <kbd>/</kbd> to search in the book</p>
<p>Press <kbd>?</kbd> to show this help</p>
<p>Press <kbd>Esc</kbd> to hide this help</p>
</div>
</div>
</div>
<div id="mdbook-body-container">
<!-- Work around some values being stored in localStorage wrapped in quotes -->
<script>
try {
let theme = localStorage.getItem('mdbook-theme');
let sidebar = localStorage.getItem('mdbook-sidebar');
if (theme.startsWith('"') && theme.endsWith('"')) {
localStorage.setItem('mdbook-theme', theme.slice(1, theme.length - 1));
}
if (sidebar.startsWith('"') && sidebar.endsWith('"')) {
localStorage.setItem('mdbook-sidebar', sidebar.slice(1, sidebar.length - 1));
}
} catch (e) { }
</script>
<!-- Set the theme before any content is loaded, prevents flash -->
<script>
const default_theme = window.matchMedia("(prefers-color-scheme: dark)").matches ? default_dark_theme : default_light_theme;
let theme;
try { theme = localStorage.getItem('mdbook-theme'); } catch(e) { }
if (theme === null || theme === undefined) { theme = default_theme; }
const html = document.documentElement;
html.classList.remove('light')
html.classList.add(theme);
html.classList.add("js");
</script>
<input type="checkbox" id="mdbook-sidebar-toggle-anchor" class="hidden">
<!-- Hide / unhide sidebar before it is displayed -->
<script>
let sidebar = null;
const sidebar_toggle = document.getElementById("mdbook-sidebar-toggle-anchor");
if (document.body.clientWidth >= 1080) {
try { sidebar = localStorage.getItem('mdbook-sidebar'); } catch(e) { }
sidebar = sidebar || 'visible';
} else {
sidebar = 'hidden';
sidebar_toggle.checked = false;
}
if (sidebar === 'visible') {
sidebar_toggle.checked = true;
} else {
html.classList.remove('sidebar-visible');
}
</script>
<nav id="mdbook-sidebar" class="sidebar" aria-label="Table of contents">
<!-- populated by js -->
<mdbook-sidebar-scrollbox class="sidebar-scrollbox"></mdbook-sidebar-scrollbox>
<noscript>
<iframe class="sidebar-iframe-outer" src="../../toc.html"></iframe>
</noscript>
<div id="mdbook-sidebar-resize-handle" class="sidebar-resize-handle">
<div class="sidebar-resize-indicator"></div>
</div>
</nav>
<div id="mdbook-page-wrapper" class="page-wrapper">
<div class="page">
<div id="mdbook-menu-bar-hover-placeholder"></div>
<div id="mdbook-menu-bar" class="menu-bar sticky">
<div class="left-buttons">
<label id="mdbook-sidebar-toggle" class="icon-button" for="mdbook-sidebar-toggle-anchor" title="Toggle Table of Contents" aria-label="Toggle Table of Contents" aria-controls="mdbook-sidebar">
<span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M0 96C0 78.3 14.3 64 32 64H416c17.7 0 32 14.3 32 32s-14.3 32-32 32H32C14.3 128 0 113.7 0 96zM0 256c0-17.7 14.3-32 32-32H416c17.7 0 32 14.3 32 32s-14.3 32-32 32H32c-17.7 0-32-14.3-32-32zM448 416c0 17.7-14.3 32-32 32H32c-17.7 0-32-14.3-32-32s14.3-32 32-32H416c17.7 0 32 14.3 32 32z"/></svg></span>
</label>
<button id="mdbook-theme-toggle" class="icon-button" type="button" title="Change theme" aria-label="Change theme" aria-haspopup="true" aria-expanded="false" aria-controls="mdbook-theme-list">
<span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 576 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M371.3 367.1c27.3-3.9 51.9-19.4 67.2-42.9L600.2 74.1c12.6-19.5 9.4-45.3-7.6-61.2S549.7-4.4 531.1 9.6L294.4 187.2c-24 18-38.2 46.1-38.4 76.1L371.3 367.1zm-19.6 25.4l-116-104.4C175.9 290.3 128 339.6 128 400c0 3.9 .2 7.8 .6 11.6c1.8 17.5-10.2 36.4-27.8 36.4H96c-17.7 0-32 14.3-32 32s14.3 32 32 32H240c61.9 0 112-50.1 112-112c0-2.5-.1-5-.2-7.5z"/></svg></span>
</button>
<ul id="mdbook-theme-list" class="theme-popup" aria-label="Themes" role="menu">
<li role="none"><button role="menuitem" class="theme" id="mdbook-theme-default_theme">Auto</button></li>
<li role="none"><button role="menuitem" class="theme" id="mdbook-theme-light">Light</button></li>
<li role="none"><button role="menuitem" class="theme" id="mdbook-theme-rust">Rust</button></li>
<li role="none"><button role="menuitem" class="theme" id="mdbook-theme-coal">Coal</button></li>
<li role="none"><button role="menuitem" class="theme" id="mdbook-theme-navy">Navy</button></li>
<li role="none"><button role="menuitem" class="theme" id="mdbook-theme-ayu">Ayu</button></li>
</ul>
<button id="mdbook-search-toggle" class="icon-button" type="button" title="Search (`/`)" aria-label="Toggle Searchbar" aria-expanded="false" aria-keyshortcuts="/ s" aria-controls="mdbook-searchbar">
<span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M416 208c0 45.9-14.9 88.3-40 122.7L502.6 457.4c12.5 12.5 12.5 32.8 0 45.3s-32.8 12.5-45.3 0L330.7 376c-34.4 25.2-76.8 40-122.7 40C93.1 416 0 322.9 0 208S93.1 0 208 0S416 93.1 416 208zM208 352c79.5 0 144-64.5 144-144s-64.5-144-144-144S64 128.5 64 208s64.5 144 144 144z"/></svg></span>
</button>
</div>
<h1 class="menu-title">Ktransformers</h1>
<div class="right-buttons">
<a href="../../print.html" title="Print this book" aria-label="Print this book">
<span class=fa-svg id="print-button"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M128 0C92.7 0 64 28.7 64 64v96h64V64H354.7L384 93.3V160h64V93.3c0-17-6.7-33.3-18.7-45.3L400 18.7C388 6.7 371.7 0 354.7 0H128zM384 352v32 64H128V384 368 352H384zm64 32h32c17.7 0 32-14.3 32-32V256c0-35.3-28.7-64-64-64H64c-35.3 0-64 28.7-64 64v96c0 17.7 14.3 32 32 32H64v64c0 35.3 28.7 64 64 64H384c35.3 0 64-28.7 64-64V384zm-16-88c-13.3 0-24-10.7-24-24s10.7-24 24-24s24 10.7 24 24s-10.7 24-24 24z"/></svg></span>
</a>
<a href="https://github.com/kvcache-ai/ktransformers" title="Git repository" aria-label="Git repository">
<span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 496 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M165.9 397.4c0 2-2.3 3.6-5.2 3.6-3.3.3-5.6-1.3-5.6-3.6 0-2 2.3-3.6 5.2-3.6 3-.3 5.6 1.3 5.6 3.6zm-31.1-4.5c-.7 2 1.3 4.3 4.3 4.9 2.6 1 5.6 0 6.2-2s-1.3-4.3-4.3-5.2c-2.6-.7-5.5.3-6.2 2.3zm44.2-1.7c-2.9.7-4.9 2.6-4.6 4.9.3 2 2.9 3.3 5.9 2.6 2.9-.7 4.9-2.6 4.6-4.6-.3-1.9-3-3.2-5.9-2.9zM244.8 8C106.1 8 0 113.3 0 252c0 110.9 69.8 205.8 169.5 239.2 12.8 2.3 17.3-5.6 17.3-12.1 0-6.2-.3-40.4-.3-61.4 0 0-70 15-84.7-29.8 0 0-11.4-29.1-27.8-36.6 0 0-22.9-15.7 1.6-15.4 0 0 24.9 2 38.6 25.8 21.9 38.6 58.6 27.5 72.9 20.9 2.3-16 8.8-27.1 16-33.7-55.9-6.2-112.3-14.3-112.3-110.5 0-27.5 7.6-41.3 23.6-58.9-2.6-6.5-11.1-33.3 2.6-67.9 20.9-6.5 69 27 69 27 20-5.6 41.5-8.5 62.8-8.5s42.8 2.9 62.8 8.5c0 0 48.1-33.6 69-27 13.7 34.7 5.2 61.4 2.6 67.9 16 17.7 25.8 31.5 25.8 58.9 0 96.5-58.9 104.2-114.8 110.5 9.2 7.9 17 22.9 17 46.4 0 33.7-.3 75.4-.3 83.6 0 6.5 4.6 14.4 17.3 12.1C428.2 457.8 496 362.9 496 252 496 113.3 383.5 8 244.8 8zM97.2 352.9c-1.3 1-1 3.3.7 5.2 1.6 1.6 3.9 2.3 5.2 1 1.3-1 1-3.3-.7-5.2-1.6-1.6-3.9-2.3-5.2-1zm-10.8-8.1c-.7 1.3.3 2.9 2.3 3.9 1.6 1 3.6.7 4.3-.7.7-1.3-.3-2.9-2.3-3.9-2-.6-3.6-.3-4.3.7zm32.4 35.6c-1.6 1.3-1 4.3 1.3 6.2 2.3 2.3 5.2 2.6 6.5 1 1.3-1.3.7-4.3-1.3-6.2-2.2-2.3-5.2-2.6-6.5-1zm-11.4-14.7c-1.6 1-1.6 3.6 0 5.9 1.6 2.3 4.3 3.3 5.6 2.3 1.6-1.3 1.6-3.9 0-6.2-1.4-2.3-4-3.3-5.6-2z"/></svg></span>
</a>
<a href="https://github.com/kvcache-ai/ktransformers/edit/main/doc/en/SFT/injection_tutorial.md" title="Suggest an edit" aria-label="Suggest an edit" rel="edit">
<span class=fa-svg id="git-edit-button"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M421.7 220.3l-11.3 11.3-22.6 22.6-205 205c-6.6 6.6-14.8 11.5-23.8 14.1L30.8 511c-8.4 2.5-17.5 .2-23.7-6.1S-1.5 489.7 1 481.2L38.7 353.1c2.6-9 7.5-17.2 14.1-23.8l205-205 22.6-22.6 11.3-11.3 33.9 33.9 62.1 62.1 33.9 33.9zM96 353.9l-9.3 9.3c-.9 .9-1.6 2.1-2 3.4l-25.3 86 86-25.3c1.3-.4 2.5-1.1 3.4-2l9.3-9.3H112c-8.8 0-16-7.2-16-16V353.9zM453.3 19.3l39.4 39.4c25 25 25 65.5 0 90.5l-14.5 14.5-22.6 22.6-11.3 11.3-33.9-33.9-62.1-62.1L314.3 67.7l11.3-11.3 22.6-22.6 14.5-14.5c25-25 65.5-25 90.5 0z"/></svg></span>
</a>
</div>
</div>
<div id="mdbook-search-wrapper" class="hidden">
<form id="mdbook-searchbar-outer" class="searchbar-outer">
<div class="search-wrapper">
<input type="search" id="mdbook-searchbar" name="searchbar" placeholder="Search this book ..." aria-controls="mdbook-searchresults-outer" aria-describedby="searchresults-header">
<div class="spinner-wrapper">
<span class=fa-svg id="fa-spin"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M304 48c0-26.5-21.5-48-48-48s-48 21.5-48 48s21.5 48 48 48s48-21.5 48-48zm0 416c0-26.5-21.5-48-48-48s-48 21.5-48 48s21.5 48 48 48s48-21.5 48-48zM48 304c26.5 0 48-21.5 48-48s-21.5-48-48-48s-48 21.5-48 48s21.5 48 48 48zm464-48c0-26.5-21.5-48-48-48s-48 21.5-48 48s21.5 48 48 48s48-21.5 48-48zM142.9 437c18.7-18.7 18.7-49.1 0-67.9s-49.1-18.7-67.9 0s-18.7 49.1 0 67.9s49.1 18.7 67.9 0zm0-294.2c18.7-18.7 18.7-49.1 0-67.9S93.7 56.2 75 75s-18.7 49.1 0 67.9s49.1 18.7 67.9 0zM369.1 437c18.7 18.7 49.1 18.7 67.9 0s18.7-49.1 0-67.9s-49.1-18.7-67.9 0s-18.7 49.1 0 67.9z"/></svg></span>
</div>
</div>
</form>
<div id="mdbook-searchresults-outer" class="searchresults-outer hidden">
<div id="mdbook-searchresults-header" class="searchresults-header"></div>
<ul id="mdbook-searchresults">
</ul>
</div>
</div>
<!-- Apply ARIA attributes after the sidebar and the sidebar toggle button are added to the DOM -->
<script>
document.getElementById('mdbook-sidebar-toggle').setAttribute('aria-expanded', sidebar === 'visible');
document.getElementById('mdbook-sidebar').setAttribute('aria-hidden', sidebar !== 'visible');
Array.from(document.querySelectorAll('#mdbook-sidebar a')).forEach(function(link) {
link.setAttribute('tabIndex', sidebar === 'visible' ? 0 : -1);
});
</script>
<div id="mdbook-content" class="content">
<main>
<h1 id="tutorial-inject-operator-step-by-step"><a class="header" href="#tutorial-inject-operator-step-by-step">Tutorial: Inject Operator Step by Step</a></h1>
<blockquote>
<p>Author: Azure-Tang</p>
</blockquote>
<h2 id="tldr"><a class="header" href="#tldr">TL;DR</a></h2>
<p>This tutorial will guide you through the process of injecting custom operators into a model using the KTransformers framework. We will use the DeepSeekV2-Chat model as an example to demonstrate how to inject custom operators into the model step by step. The tutorial will cover the following topics:</p>
<ul>
<li><a href="#tldr">TL;DR</a></li>
<li><a href="#how-to-write-injection-rules">How to Write Injection Rules</a></li>
<li><a href="#understanding-model-structure">Understanding Model Structure</a></li>
<li><a href="#matrix-absorption-based-mla-injection">Matrix Absorption-based MLA Injection</a></li>
<li><a href="#injection-of-routed-experts">Injection of Routed Experts</a></li>
<li><a href="#injection-of-linear-layers">Injection of Linear Layers</a></li>
<li><a href="#injection-of-modules-with-pre-calculated-buffers">Injection of Modules with Pre-calculated Buffers</a></li>
<li><a href="#specifying-running-devices-for-modules">Specifying Running Devices for Modules</a></li>
<li><a href="#muti-gpu">Muti-GPU</a></li>
<li><a href="#how-to-write-a-new-operator-and-inject-into-the-model">How to Write a New Operator and Inject into the Model</a></li>
</ul>
<h2 id="how-to-write-injection-rules"><a class="header" href="#how-to-write-injection-rules">How to Write Injection Rules</a></h2>
<p>The basic form of the injection rules for the Inject framework is as follows:</p>
<pre><code class="language-yaml">- match:
name: "^model\\.layers\\..*\\.*$" # Target module name
class: torch.nn.Linear # Target module
replace:
class: "default"
kwargs:
generate_device: "cuda:0"
# your_op_param_1: 1234
# your_op_param_2: 5678
recursive: True
</code></pre>
<ul>
<li>match: This field marks the matching rules, which can appear in two forms, name and class. These two matching rules can appear together or separately; they only match when both criteria are met.</li>
<li>replace:
<ul>
<li>class: Python class that can be imported to replace the target module. If no replacement is desired, set to default.</li>
<li>kwargs: List of parameters needed for module initialization.
<ul>
<li>generate_device: The device for this module, can be set to “cpu”, “cuda”, “cuda:1”, etc.</li>
</ul>
</li>
</ul>
</li>
<li>recursive: Whether to recursively inject this modules submodules, default is True.</li>
</ul>
<p>For the recursive field: Some modules contain multiple submodules, such as the Self-attention module typically includes q/k/v/o four linear modules. If we replace the self-attention module but do not want the internal linear modules to be covered by other rules, set this rule to False.</p>
<h2 id="understanding-model-structure"><a class="header" href="#understanding-model-structure">Understanding Model Structure</a></h2>
<p>Using <a href="https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat">deepseek-ai/DeepSeek-V2-Lite-Chat</a> as an example, we can follow the above rules step by step to inject our custom module and run it. KTransformers offers a high degree of flexibility, allowing you to replace/experiment with basic operators. However, it also requires users to clearly understand the structure of the model they are running.</p>
<p>Fortunately, knowing the structure of a model is very simple. Open the file list on the <a href="https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat/tree/main">deepseek-ai/DeepSeek-V2-Lite</a> homepage, and you can see the following files:</p>
<p align="center">
<picture>
<img alt="Inject-Struction" src="../../assets/model_structure_guild.png" width="60%">
</picture>
</p>
<p>From the <code>.saftensors</code> file, we can see the name of each layers weights, corresponding to the match.name attribute in the injection rules.
From the <code>modeling_deepseek.py</code> file, we can see the specific implementation of each module class, with the class name corresponding to the match.class attribute in the injection rules.</p>
<p>The structure of the DeepSeekV2 model from the <code>.saftensors</code> and <code>modeling_deepseek.py</code> files is as follows:</p>
<p align="center">
<picture>
<img alt="Inject-Struction" src="../../assets/deepseekv2_structure.png" width="60%">
</picture>
</p>
<p>Supported operators and their corresponding classes are as follows:</p>
<div class="table-wrapper">
<table>
<thead>
<tr><th>match</th><th>replace</th><th>backends</th><th>descriptions</th></tr>
</thead>
<tbody>
<tr><td>Linear</td><td>KTransformersLinear</td><td>KLinearMarlin</td><td>Marlin as backend</td></tr>
<tr><td></td><td></td><td>KLinearTorch</td><td>pytorch as backend</td></tr>
<tr><td></td><td></td><td>KLinearCPUInfer</td><td>llamafile as backend</td></tr>
<tr><td></td><td></td><td>KLinearFP8</td><td>Triton fp8_gemm kernel. Requires GPU be able to caluculate fp8 data</td></tr>
<tr><td>experts</td><td>KTransformersExperts</td><td>KExpertsTorch</td><td>pytorch as backend</td></tr>
<tr><td></td><td></td><td>KExpertsMarlin</td><td>Marlin as backend</td></tr>
<tr><td></td><td></td><td>KExpertsCPU</td><td>llamafile as backend</td></tr>
<tr><td>Attention</td><td>KDeepseekV2Attention</td><td>KDeepseekV2Attention</td><td>MLA implementation</td></tr>
<tr><td>MoE</td><td>KMistralSparseMoEBlock</td><td>KQwen2MoeSparseMoeBlock</td><td>MoE for Qwen2</td></tr>
<tr><td></td><td>KDeepseekV2MoE</td><td>KDeepseekV2MoE</td><td>MoE for DeepseekV2</td></tr>
<tr><td>Model</td><td>KQwen2MoeModel</td><td>KQwen2MoeModel</td><td>Model for Qwen2</td></tr>
<tr><td></td><td>KDeepseekV2Model</td><td>KDeepseekV2Model</td><td>Model for DeepseekV2</td></tr>
<tr><td>RoPE</td><td>RotaryEmbedding</td><td>RotaryEmbedding</td><td>RoPE module</td></tr>
<tr><td></td><td>YarnRotaryEmbedding</td><td>YarnRotaryEmbedding</td><td>RoPE module</td></tr>
</tbody>
</table>
</div>
<p>Then we start step-by-step injection of custom modules, our targets are:</p>
<ul>
<li>Replace the linear module with custom Marlin linear module.</li>
<li>Replace the self-attention module with a custom Absorption-based MLA module.</li>
<li>Replace the experts module with a custom Experts module.</li>
<li>Replace the MoE module with a custom MoE module.</li>
<li>Replace the RoPE module with a custom RoPE module.</li>
<li>Set the running device for each module.</li>
</ul>
<p>The full implementation of the injection rules can be found in the <a href="https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml">here</a>.</p>
<h2 id="matrix-absorption-based-mla-injection"><a class="header" href="#matrix-absorption-based-mla-injection">Matrix Absorption-based MLA Injection</a></h2>
<p>For the injection of the Attention module, we only need to use a regular expression to match the module names used in transformers and replace them with our own MLA module implementation. The YAML injection rule is as follows:</p>
<pre><code class="language-yaml">- match:
name: "^model\\.layers\\..*\\.self_attn$" # Regular expression
replace:
class: ktransformers.operators.attention.KDeepseekV2Attention # Optimized MLA implementation
</code></pre>
<p>As you can see, each rule in the YAML file has two parts: match and replace. The match part specifies the module to be replaced, and the replace part specifies the module to be injected into the model along with the initialization keywords.</p>
<h2 id="injection-of-routed-experts"><a class="header" href="#injection-of-routed-experts">Injection of Routed Experts</a></h2>
<p>For Routed Experts (corresponding to the exps in the diagram), the module we inject is CPUInfer, which is wrapped in the wrapper module KTransformersExperts. KTransformersExperts has multiple implementations, and we need to specify keywords to tell the wrapper module which implementation we want to use and how we plan to use it.</p>
<p>In the source code of the transformer, MoE is implemented using nn.ModuleList. We do not want KTransformers to traverse all submodules in the list and inject them one by one, so in this rule, we set recursive: False to prevent recursive injection into the submodules of this module. The YAML rule is as follows:</p>
<pre><code class="language-yaml">- match:
name: "^model\\.layers\\..*\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersExperts # Custom MoE kernel with expert parallelism
kwargs:
generate_device: "cpu"
generate_op: "MLPCPUExperts"
out_device: "cuda"
recursive: False # Don't recursively inject submodules of this module
</code></pre>
<p>If we inject Routed Experts as a custom module, we cannot use the interfaces in the original <code>nn.ModuleList</code>. Therefore, it is necessary to modify the forward function in the FFN module. The simplest method is to implement a new module with a custom forward function and inject it.</p>
<pre><code class="language-yaml">- match:
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
replace:
class: ktransformers.operators.experts.KDeepseekV2MoE # MLP module with custom forward function
</code></pre>
<h2 id="injection-of-linear-layers"><a class="header" href="#injection-of-linear-layers">Injection of Linear Layers</a></h2>
<p>For the remaining linear layer modules, we aim to use quantized operators to save storage space while improving performance. Since there is no current research on using MLA and quantization together, we do not want to inject linear into the MLA operator. Therefore, we can modify the regular expression and add a type check in the match part of the rule. Only modules that match both the name and class simultaneously will be injected. We also need to pass some keywords similar to the injection of Routed Experts. The YAML rule is as follows:</p>
<pre><code class="language-yaml">- match:
name: "^model\\.layers\\.(?!.*self_attn).*$" # Regular expression
class: torch.nn.Linear # Only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformersLinear # Optimized kernel on quantized data types
kwargs:
generate_device: "cuda"
generate_op: "QuantizedLinearMarlin"
</code></pre>
<h2 id="injection-of-modules-with-pre-calculated-buffers"><a class="header" href="#injection-of-modules-with-pre-calculated-buffers">Injection of Modules with Pre-calculated Buffers</a></h2>
<p>To avoid occupying resources when initializing the injected original model, we use torchs meta device to initialize the original model. The RoPE module pre-calculates some buffers during initialization, but no calculations are performed when using the meta device. Therefore, we need to compensate for the calculation of the buffer when loading the model. Simply, we inject a custom module into the rotary embedding module, which performs pre-calculation during loading. The YAML rule is as follows:</p>
<pre><code class="language-yaml">- match:
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbedding
</code></pre>
<h2 id="specifying-running-devices-for-modules"><a class="header" href="#specifying-running-devices-for-modules">Specifying Running Devices for Modules</a></h2>
<p>Finally, we set a fallback basic attribute generate_device for all modules:</p>
<pre><code class="language-yaml">- match:
name: "^model\\.layers\\..*\\.|^lm_head"
replace:
class: "default"
kwargs:
generate_device: "cuda"
- match:
name: "^model.embed_tokens"
replace:
class: "default"
kwargs:
generate_device: "cpu"
</code></pre>
<p>Through these two rules, we place all previously unmatched layers (and their submodules) and lm_head on cuda, and the embedding on cpu. Note that the properties of a module will be determined by the first rule it matches. For example, if you later set a new replace.kwargs.generate_device in an injected module, the device set earlier will take precedence. If your computer has multiple cards, you can also configure the model to multiple cards.</p>
<h2 id="muti-gpu"><a class="header" href="#muti-gpu">Muti-GPU</a></h2>
<p>If you have multiple GPUs, you can set the device for each module to different GPUs.
DeepseekV2-Chat got 60 layers, if we got 2 GPUs, we can allocate 30 layers to each GPU. Complete multi GPU rule examples <a href="https://github.com/kvcache-ai/ktransformers/blob/main/ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu.yaml">here</a>.</p>
<p align="center">
<picture>
<img alt="Inject-Struction" src="../../assets/multi_gpu.png" width="60%">
</picture>
</p>
<p>First of all, for multi-GPU, we have to inject an new operator <code>KDeepseekV2Model</code>. And set division of the layers to different GPUs. For our case, we have to set the <code>transfer_map</code> in the <code>KDeepseekV2Model</code> operatoras as follows:</p>
<pre><code class="language-yaml">- match:
name: "^model$"
replace:
class: "ktransformers.operators.models.KDeepseekV2Model"
kwargs:
transfer_map:
30: "cuda:1"
</code></pre>
<p>And we have to set the device for each module in the model.</p>
<p>For example, for <code>routed experts</code>, the yaml for one GPU is:</p>
<pre><code class="language-yaml">- match:
name: "^model\\.layers\\..*\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersExperts # Custom MoE kernel with expert parallelism
kwargs:
generate_device: "cuda:0"
generate_op: "MLPCUDAExperts"
out_device: "cuda:0"
recursive: False # Don't recursively inject submodules of this module
</code></pre>
<p>But for two GPUs, we need to set the device for each module in the model.</p>
<pre><code class="language-yaml"># allcate 0-29 layerss out_device to cuda:0
- match:
name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism
kwargs:
generate_device: "cpu"
generate_op: "KExpertsCPU"
out_device: "cuda:0"
recursive: False # don't recursively inject submodules of this module
# allocate 30-59 layerss out_device to cuda:1
- match:
name: "^model\\.layers\\.([345][0-9])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersExperts # custom MoE Kernel with expert paralleism
kwargs:
generate_device: "cpu"
generate_op: "KExpertsCPU"
out_device: "cuda:1"
recursive: False # don't recursively inject submodules of this module
</code></pre>
<p>For other modules, we can set the device in the same way.</p>
<h2 id="how-to-write-a-new-operator-and-inject-into-the-model"><a class="header" href="#how-to-write-a-new-operator-and-inject-into-the-model">How to Write a New Operator and Inject into the Model</a></h2>
<p>In this section, we will explain how to write an operator that can be injected, using the implementation of a new linear as an example.</p>
<p>First, all injectable operators need to inherit from the BaseInjectedModule class, which inherits some attributes required by our injection framework. Its initialization function needs to meet the following basic format:</p>
<pre><code class="language-python">class LinearTorchInject(BaseInjectedModule):
def __init__(
self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module = None,
generate_device: str = "cuda",
**kwargs,
):
super().__init__(key, gguf_loader, config, orig_module, generate_device, **kwargs)
</code></pre>
<p>If users have other parameters that need to be passed to this class, they can also be included in the init function and re-passed in the kwargs parameter in the yaml file. For example, if our operator wants to pass a parameter <code>my_param</code>, the init function can be written as:</p>
<pre><code class="language-python">class LinearTorchInject(BaseInjectedModule):
def __init__(
self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module = None,
generate_device: str = "cuda",
my_param: bool = True,
**kwargs,
):
super().__init__(key, gguf_loader, config, orig_module, generate_device, **kwargs)
self.my_param = my_param
</code></pre>
<p>Then our injection rule can be written as:</p>
<pre><code class="language-yaml">- match:
name: "^model\\.layers\\..*$" # Regular expression matches the module name.
class: torch.nn.Linear # Type restrictions can be added.
replace:
class: ktransformers.operators.linear.LinearTorchInject # Inject module path
kwargs: # Extra parameters
generate_device: "cuda"
my_param: True
</code></pre>
<p>For the linear module, it is also necessary to read weights from a gguf file. We provide the <code>KLinearBase</code> class to help users read weights from gguf files. Users only need to inherit and implement the load, unload, and forward functions. Therefore, a fully injectable linear class would look like this:</p>
<pre><code class="language-python">class LinearTorchInject(BaseInjectedModule, KLinearBase):
def __init__(
self,
key: str,
gguf_loader: GGUFLoader,
config: PretrainedConfig,
orig_module: nn.Module = None,
generate_device: str = "cuda",
**kwargs,
):
super().__init__(key, gguf_loader, config, orig_module, generate_device, **kwargs)
KLinearBase.__init__(self)
self.has_bias = False
self.dtype = torch.get_default_dtype()
self.w = None
self.has_bias = False
def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None):
if device is None: device = self.device
if w is None: w = self.load_weight(device=device)
if isinstance(w, nn.Parameter):
self.w = w.to(dtype=self.dtype).view(self.out_features, self.in_features).T
self.has_bias = False
elif isinstance(w, tuple):
self.w = w[0].to(dtype=self.dtype).view(self.out_features, self.in_features).T
self.bias = w[1].to(dtype=self.dtype)
self.has_bias = True
else:
raise ValueError("Invalid weight type")
self.w = self.w.to(device)
if self.has_bias:
self.bias = self.bias.to(device)
def unload(self):
if self.w is not None:
self.w = None
if self.has_bias:
self.bias = None
def forward(self, x: torch.Tensor) -&gt; torch.Tensor:
dtype = x.dtype
out_device = x.device
x = x.to(device=self.device, dtype=self.dtype)
x = x @ self.w
if self.has_bias:
x = x + self.bias
x = x.to(dtype=dtype, device=out_device)
return x
</code></pre>
<p>Note that the <code>self.load_weight</code> function is provided by the KLinearBase class to help users load weights from a gguf file into the module. The implementation details of KLinearBase can be found on <a href="https://github.com/kvcache-ai/ktransformers/blob/44f57270c9514d79fab224186d90ccf61059331a/ktransformers/operators/linear.py#L31">GITHUB</a>.</p>
</main>
<nav class="nav-wrapper" aria-label="Page navigation">
<!-- Mobile navigation buttons -->
<a rel="prev" href="../../en/SFT/KTransformers-Fine-Tuning_Developer-Technical-Notes.html" class="mobile-nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
<span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 320 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M41.4 233.4c-12.5 12.5-12.5 32.8 0 45.3l160 160c12.5 12.5 32.8 12.5 45.3 0s12.5-32.8 0-45.3L109.3 256 246.6 118.6c12.5-12.5 12.5-32.8 0-45.3s-32.8-12.5-45.3 0l-160 160z"/></svg></span>
</a>
<a rel="next prefetch" href="../../en/kt-kernel/index.html" class="mobile-nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
<span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 320 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M278.6 233.4c12.5 12.5 12.5 32.8 0 45.3l-160 160c-12.5 12.5-32.8 12.5-45.3 0s-12.5-32.8 0-45.3L210.7 256 73.4 118.6c-12.5-12.5-12.5-32.8 0-45.3s32.8-12.5 45.3 0l160 160z"/></svg></span>
</a>
<div style="clear: both"></div>
</nav>
</div>
</div>
<nav class="nav-wide-wrapper" aria-label="Page navigation">
<a rel="prev" href="../../en/SFT/KTransformers-Fine-Tuning_Developer-Technical-Notes.html" class="nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
<span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 320 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M41.4 233.4c-12.5 12.5-12.5 32.8 0 45.3l160 160c12.5 12.5 32.8 12.5 45.3 0s12.5-32.8 0-45.3L109.3 256 246.6 118.6c12.5-12.5 12.5-32.8 0-45.3s-32.8-12.5-45.3 0l-160 160z"/></svg></span>
</a>
<a rel="next prefetch" href="../../en/kt-kernel/index.html" class="nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
<span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 320 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M278.6 233.4c12.5 12.5 12.5 32.8 0 45.3l-160 160c-12.5 12.5-32.8 12.5-45.3 0s-12.5-32.8 0-45.3L210.7 256 73.4 118.6c-12.5-12.5-12.5-32.8 0-45.3s32.8-12.5 45.3 0l160 160z"/></svg></span>
</a>
</nav>
</div>
<template id=fa-eye><span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 576 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M288 32c-80.8 0-145.5 36.8-192.6 80.6C48.6 156 17.3 208 2.5 243.7c-3.3 7.9-3.3 16.7 0 24.6C17.3 304 48.6 356 95.4 399.4C142.5 443.2 207.2 480 288 480s145.5-36.8 192.6-80.6c46.8-43.5 78.1-95.4 93-131.1c3.3-7.9 3.3-16.7 0-24.6c-14.9-35.7-46.2-87.7-93-131.1C433.5 68.8 368.8 32 288 32zM432 256c0 79.5-64.5 144-144 144s-144-64.5-144-144s64.5-144 144-144s144 64.5 144 144zM288 192c0 35.3-28.7 64-64 64c-11.5 0-22.3-3-31.6-8.4c-.2 2.8-.4 5.5-.4 8.4c0 53 43 96 96 96s96-43 96-96s-43-96-96-96c-2.8 0-5.6 .1-8.4 .4c5.3 9.3 8.4 20.1 8.4 31.6z"/></svg></span></template>
<template id=fa-eye-slash><span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 640 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M38.8 5.1C28.4-3.1 13.3-1.2 5.1 9.2S-1.2 34.7 9.2 42.9l592 464c10.4 8.2 25.5 6.3 33.7-4.1s6.3-25.5-4.1-33.7L525.6 386.7c39.6-40.6 66.4-86.1 79.9-118.4c3.3-7.9 3.3-16.7 0-24.6c-14.9-35.7-46.2-87.7-93-131.1C465.5 68.8 400.8 32 320 32c-68.2 0-125 26.3-169.3 60.8L38.8 5.1zM223.1 149.5C248.6 126.2 282.7 112 320 112c79.5 0 144 64.5 144 144c0 24.9-6.3 48.3-17.4 68.7L408 294.5c5.2-11.8 8-24.8 8-38.5c0-53-43-96-96-96c-2.8 0-5.6 .1-8.4 .4c5.3 9.3 8.4 20.1 8.4 31.6c0 10.2-2.4 19.8-6.6 28.3l-90.3-70.8zm223.1 298L373 389.9c-16.4 6.5-34.3 10.1-53 10.1c-79.5 0-144-64.5-144-144c0-6.9 .5-13.6 1.4-20.2L83.1 161.5C60.3 191.2 44 220.8 34.5 243.7c-3.3 7.9-3.3 16.7 0 24.6c14.9 35.7 46.2 87.7 93 131.1C174.5 443.2 239.2 480 320 480c47.8 0 89.9-12.9 126.2-32.5z"/></svg></span></template>
<template id=fa-copy><span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M502.6 70.63l-61.25-61.25C435.4 3.371 427.2 0 418.7 0H255.1c-35.35 0-64 28.66-64 64l.0195 256C192 355.4 220.7 384 256 384h192c35.2 0 64-28.8 64-64V93.25C512 84.77 508.6 76.63 502.6 70.63zM464 320c0 8.836-7.164 16-16 16H255.1c-8.838 0-16-7.164-16-16L239.1 64.13c0-8.836 7.164-16 16-16h128L384 96c0 17.67 14.33 32 32 32h47.1V320zM272 448c0 8.836-7.164 16-16 16H63.1c-8.838 0-16-7.164-16-16L47.98 192.1c0-8.836 7.164-16 16-16H160V128H63.99c-35.35 0-64 28.65-64 64l.0098 256C.002 483.3 28.66 512 64 512h192c35.2 0 64-28.8 64-64v-32h-47.1L272 448z"/></svg></span></template>
<template id=fa-play><span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 384 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M73 39c-14.8-9.1-33.4-9.4-48.5-.9S0 62.6 0 80V432c0 17.4 9.4 33.4 24.5 41.9s33.7 8.1 48.5-.9L361 297c14.3-8.7 23-24.2 23-41s-8.7-32.2-23-41L73 39z"/></svg></span></template>
<template id=fa-clock-rotate-left><span class=fa-svg><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><!--! Font Awesome Free 6.2.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2022 Fonticons, Inc. --><path d="M75 75L41 41C25.9 25.9 0 36.6 0 57.9V168c0 13.3 10.7 24 24 24H134.1c21.4 0 32.1-25.9 17-41l-30.8-30.8C155 85.5 203 64 256 64c106 0 192 86 192 192s-86 192-192 192c-40.8 0-78.6-12.7-109.7-34.4c-14.5-10.1-34.4-6.6-44.6 7.9s-6.6 34.4 7.9 44.6C151.2 495 201.7 512 256 512c141.4 0 256-114.6 256-256S397.4 0 256 0C185.3 0 121.3 28.7 75 75zm181 53c-13.3 0-24 10.7-24 24V256c0 6.4 2.5 12.5 7 17l72 72c9.4 9.4 24.6 9.4 33.9 0s9.4-24.6 0-33.9l-65-65V152c0-13.3-10.7-24-24-24z"/></svg></span></template>
<script>
window.playground_copyable = true;
</script>
<script src="../../ace-2a3cd908.js"></script>
<script src="../../mode-rust-2c9d5c9a.js"></script>
<script src="../../editor-16ca416c.js"></script>
<script src="../../theme-dawn-4493f9c8.js"></script>
<script src="../../theme-tomorrow_night-9dbe62a9.js"></script>
<script src="../../elasticlunr-ef4e11c1.min.js"></script>
<script src="../../mark-09e88c2c.min.js"></script>
<script src="../../searcher-c2a407aa.js"></script>
<script src="../../clipboard-1626706a.min.js"></script>
<script src="../../highlight-abc7f01d.js"></script>
<script src="../../book-a0b12cfe.js"></script>
<!-- Custom JS scripts -->
</div>
</body>
</html>