From 1ff0fc138461c882db6a50b5545601182994ff2e Mon Sep 17 00:00:00 2001 From: Aleksander Grygier Date: Mon, 18 May 2026 16:09:40 +0200 Subject: [PATCH 01/33] ui: Refactor models store, MCP service, and gate logs behind VITE_DEBUG (#23236) * refactor: Scope console logs to `DEV` + `VITE_DEBUG` env vars * refactor: skip MCP proxy probe when no server requires it * refactor: suppress expected disconnect errors during MCP client shutdown * refactor: Deduplicate requests * refactor: deduplicate model fetching across ROUTER and MODEL modes * refactor: Clean up models logic * chore: Add `.env.example` file * refactor: replace client-side CORS proxy probe with server status flag * refactor: Post-review fixes * test: add vitest client setup with API fetch mocks --- tools/server/server-context.cpp | 1 + tools/server/server-models.cpp | 1 + tools/ui/.env.example | 2 + .../ChatFormActionModels.svelte | 28 +- .../ChatFormPickerMcpPrompts.svelte | 4 +- .../lib/hooks/use-models-selector.svelte.ts | 32 +- tools/ui/src/lib/services/mcp.service.ts | 59 ++- .../ui/src/lib/services/migration.service.ts | 59 ++- tools/ui/src/lib/stores/mcp.svelte.ts | 25 +- tools/ui/src/lib/stores/models.svelte.ts | 363 +++++++----------- tools/ui/src/lib/types/api.d.ts | 1 + tools/ui/src/lib/utils/api-key-validation.ts | 18 +- tools/ui/src/lib/utils/legacy-migration.ts | 17 +- tools/ui/src/routes/(chat)/+page.svelte | 17 +- tools/ui/src/routes/+layout.svelte | 49 +-- tools/ui/tests/client/page.svelte.test.ts | 7 +- tools/ui/vitest-setup-client.ts | 76 ++++ 17 files changed, 410 insertions(+), 349 deletions(-) create mode 100644 tools/ui/.env.example diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 0f3fb9efa..6b16c6b49 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -3885,6 +3885,7 @@ void server_routes::init_routes() { { "eos_token", meta->eos_token_str }, { "build_info", meta->build_info }, { "is_sleeping", 
queue_tasks.is_sleeping() }, + { "cors_proxy_enabled", params.ui_mcp_proxy || params.webui_mcp_proxy }, }; if (params.use_jinja) { if (!tmpl_tools.empty()) { diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 6c6fed52d..ccf42320f 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -1165,6 +1165,7 @@ void server_models_routes::init_routes() { // Deprecated: use ui_settings instead (kept for backward compat) {"webui_settings", webui_settings}, {"build_info", std::string(llama_build_info())}, + {"cors_proxy_enabled", params.ui_mcp_proxy || params.webui_mcp_proxy}, }); return res; } diff --git a/tools/ui/.env.example b/tools/ui/.env.example new file mode 100644 index 000000000..9a995b746 --- /dev/null +++ b/tools/ui/.env.example @@ -0,0 +1,2 @@ +VITE_PUBLIC_APP_NAME='llama-ui' +# VITE_DEBUG='true' diff --git a/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionModels.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionModels.svelte index 2f9471e0d..297020605 100644 --- a/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionModels.svelte +++ b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionModels.svelte @@ -7,7 +7,6 @@ import { activeMessages } from '$lib/stores/conversations.svelte'; interface Props { - currentModel?: string; disabled?: boolean; forceForegroundText?: boolean; hasAudioModality?: boolean; @@ -20,7 +19,6 @@ } let { - currentModel, disabled = false, forceForegroundText = false, hasAudioModality = $bindable(false), @@ -41,14 +39,28 @@ let lastSyncedConversationModel: string | null = null; + let selectorModel = $derived(conversationModel ?? modelsStore.selectedModelName ?? 
null); + $effect(() => { if (conversationModel && conversationModel !== lastSyncedConversationModel) { - lastSyncedConversationModel = conversationModel; + if (modelOptions().some((m) => m.model === conversationModel)) { + modelsStore.selectedModelName = conversationModel; + modelsStore.selectModelByName(conversationModel); + } else { + modelsStore.selectedModelName = null; + modelsStore.clearSelection(); + } - modelsStore.selectModelByName(conversationModel); - } else if (isRouter && !modelsStore.selectedModelId && modelsStore.loadedModelIds.length > 0) { + lastSyncedConversationModel = conversationModel; + } else if ( + isRouter && + !modelsStore.selectedModelId && + modelsStore.loadedModelIds.length > 0 && + activeMessages().length > 0 && + !conversationModel + ) { lastSyncedConversationModel = null; - // auto-select the first loaded model only when nothing is selected yet + const first = modelOptions().find((m) => modelsStore.loadedModelIds.includes(m.model)); if (first) modelsStore.selectModelById(first.id); @@ -151,7 +163,7 @@ @@ -159,7 +171,7 @@ diff --git a/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpPrompts/ChatFormPickerMcpPrompts.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpPrompts/ChatFormPickerMcpPrompts.svelte index 567fdac47..ff734ac88 100644 --- a/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpPrompts/ChatFormPickerMcpPrompts.svelte +++ b/tools/ui/src/lib/components/app/chat/ChatForm/ChatFormPickers/ChatFormPickerMcpPrompts/ChatFormPickerMcpPrompts.svelte @@ -162,7 +162,7 @@ return; } - if (import.meta.env.DEV) { + if (import.meta.env.DEV && import.meta.env.VITE_DEBUG) { console.log('[ChatFormPickerMcpPrompts] Fetching completions for:', { serverName: selectedPrompt.serverName, promptName: selectedPrompt.name, @@ -181,7 +181,7 @@ value ); - if (import.meta.env.DEV) { + if (import.meta.env.DEV && import.meta.env.VITE_DEBUG) { 
console.log('[ChatFormPickerMcpPrompts] Autocomplete result:', { argName, value, diff --git a/tools/ui/src/lib/hooks/use-models-selector.svelte.ts b/tools/ui/src/lib/hooks/use-models-selector.svelte.ts index 537a2af18..098cb2c27 100644 --- a/tools/ui/src/lib/hooks/use-models-selector.svelte.ts +++ b/tools/ui/src/lib/hooks/use-models-selector.svelte.ts @@ -66,7 +66,6 @@ export function useModelsSelector(opts: UseModelsSelectorOptions): UseModelsSele const serverModel = $derived(singleModelName()); const currentModel = $derived(opts.currentModel()); - const useGlobalSelection = $derived(opts.useGlobalSelection?.() ?? false); const onModelChange = $derived(opts.onModelChange?.()); const isHighlightedCurrentModelActive = $derived.by(() => { @@ -128,6 +127,7 @@ export function useModelsSelector(opts: UseModelsSelectorOptions): UseModelsSele if (onModelChange) { const result = await onModelChange(option.id, option.model); + if (result === false) { shouldCloseMenu = false; } @@ -142,12 +142,14 @@ export function useModelsSelector(opts: UseModelsSelectorOptions): UseModelsSele const textarea = document.querySelector( '[data-slot="chat-form"] textarea' ); + textarea?.focus(); }); } if (!onModelChange && isRouter && !modelsStore.isModelLoaded(option.model)) { isLoadingModel = true; + modelsStore .loadModel(option.model) .catch((error) => console.error('Failed to load model:', error)) @@ -158,6 +160,7 @@ export function useModelsSelector(opts: UseModelsSelectorOptions): UseModelsSele function getDisplayOption(): ModelOption | undefined { if (!isRouter) { const displayModel = serverModel || currentModel; + if (displayModel) { return { id: serverModel ? 
'current' : 'offline-current', @@ -166,12 +169,8 @@ export function useModelsSelector(opts: UseModelsSelectorOptions): UseModelsSele capabilities: [] }; } - return undefined; - } - if (useGlobalSelection && activeId) { - const selected = options.find((option) => option.id === activeId); - if (selected) return selected; + return undefined; } if (currentModel) { @@ -183,6 +182,7 @@ export function useModelsSelector(opts: UseModelsSelectorOptions): UseModelsSele capabilities: [] }; } + return options.find((option) => option.model === currentModel); } @@ -197,57 +197,77 @@ export function useModelsSelector(opts: UseModelsSelectorOptions): UseModelsSele get options() { return options; }, + get loading() { return loading; }, + get updating() { return updating; }, + get activeId() { return activeId; }, + get isRouter() { return isRouter; }, + get serverModel() { return serverModel; }, + get isHighlightedCurrentModelActive() { return isHighlightedCurrentModelActive; }, + get isCurrentModelInCache() { return isCurrentModelInCache; }, + get filteredOptions() { return filteredOptions; }, + get groupedFilteredOptions() { return groupedFilteredOptions; }, + get isLoadingModel() { return isLoadingModel; }, + get searchTerm() { return searchTerm; }, + get showModelDialog() { return showModelDialog; }, + get infoModelId() { return infoModelId; }, + setSearchTerm(value: string) { searchTerm = value; }, + setShowModelDialog(value: boolean) { showModelDialog = value; }, + handleInfoClick, + handleSelect, + handleOpenChange, + isFavorite(model: string) { return modelsStore.favoriteModelIds.has(model); }, + getDisplayOption }; } diff --git a/tools/ui/src/lib/services/mcp.service.ts b/tools/ui/src/lib/services/mcp.service.ts index 44cbd4a8a..d596381aa 100644 --- a/tools/ui/src/lib/services/mcp.service.ts +++ b/tools/ui/src/lib/services/mcp.service.ts @@ -392,7 +392,7 @@ export class MCPService { const url = new URL(config.url); - if (import.meta.env.DEV) { + if (import.meta.env.DEV && 
import.meta.env.VITE_DEBUG) { console.log(`[MCPService] Creating WebSocket transport for ${url.href}`); } @@ -413,12 +413,12 @@ export class MCPService { onLog ); - if (useProxy && import.meta.env.DEV) { + if (useProxy && import.meta.env.DEV && import.meta.env.VITE_DEBUG) { console.log(`[MCPService] Using CORS proxy for ${config.url} -> ${url.href}`); } try { - if (import.meta.env.DEV) { + if (import.meta.env.DEV && import.meta.env.VITE_DEBUG) { console.log(`[MCPService] Creating StreamableHTTP transport for ${url.href}`); } @@ -520,7 +520,7 @@ export class MCPService { ) ); - if (import.meta.env.DEV) { + if (import.meta.env.DEV && import.meta.env.VITE_DEBUG) { console.log(`[MCPService][${serverName}] Creating transport...`); } @@ -560,6 +560,22 @@ export class MCPService { ); const runtimeErrorHandler = (error: Error) => { + // Ignore errors that are expected when the SDK's transport is closed, + // or when connecting to servers that don't support SSE (stateless-only + // endpoints returning 405). The SDK wraps the original AbortError in + // a new Error with the message "SSE stream disconnected: AbortError", + // and also produces "Cannot cancel a stream locked by a reader". + // DOMException is thrown by the browser when aborting fetch requests. 
+ const msg = error.message || String(error); + if ( + error.name === 'AbortError' || + error instanceof DOMException || + msg.includes('SSE stream disconnected') || + msg.includes('stream locked by a reader') || + msg.includes('The operation was aborted') + ) { + return; + } console.error(`[MCPService][${serverName}] Protocol error after initialize:`, error); }; @@ -658,7 +674,10 @@ export class MCPService { this.createLog(MCPConnectionPhase.LISTING_TOOLS, 'Listing available tools...') ); - console.log(`[MCPService][${serverName}] Connected, listing tools...`); + if (import.meta.env.DEV && import.meta.env.VITE_DEBUG) { + console.log(`[MCPService][${serverName}] Connected, listing tools...`); + } + const tools = await this.listTools({ client, transport, @@ -680,10 +699,11 @@ export class MCPService { `Connection established with ${tools.length} tools (${connectionTimeMs}ms)` ) ); - - console.log( - `[MCPService][${serverName}] Initialization complete with ${tools.length} tools in ${connectionTimeMs}ms` - ); + if (import.meta.env.DEV && import.meta.env.VITE_DEBUG) { + console.log( + `[MCPService][${serverName}] Initialization complete with ${tools.length} tools in ${connectionTimeMs}ms` + ); + } return { client, @@ -709,9 +729,22 @@ export class MCPService { * @param connection - The active MCP connection to close */ static async disconnect(connection: MCPConnection): Promise { - console.log(`[MCPService][${connection.serverName}] Disconnecting...`); + if (import.meta.env.DEV && import.meta.env.VITE_DEBUG) { + console.log(`[MCPService][${connection.serverName}] Disconnecting...`); + } + try { - // Prevent reconnection on voluntary disconnect + // Terminate the session first for streamable-http transports to cleanly + // close streams, matching the inspector's disconnect flow. 
+ if (connection.transport instanceof StreamableHTTPClientTransport) { + await connection.transport.terminateSession(); + } + + // Clear error handlers before closing to prevent noise from expected + // abort errors during shutdown. The inspector avoids this entirely + // by not setting onerror, but since we use it for protocol logging, + // we must clear it before disconnect. + connection.client.onerror = undefined; if (connection.transport.onclose) { connection.transport.onclose = undefined; } @@ -1078,7 +1111,9 @@ export class MCPService { try { await connection.client.unsubscribeResource({ uri }); - console.log(`[MCPService][${connection.serverName}] Unsubscribed from resource: ${uri}`); + if (import.meta.env.DEV && import.meta.env.VITE_DEBUG) { + console.log(`[MCPService][${connection.serverName}] Unsubscribed from resource: ${uri}`); + } } catch (error) { console.error( `[MCPService][${connection.serverName}] Failed to unsubscribe from resource:`, diff --git a/tools/ui/src/lib/services/migration.service.ts b/tools/ui/src/lib/services/migration.service.ts index 5ed24c00d..35d47070a 100644 --- a/tools/ui/src/lib/services/migration.service.ts +++ b/tools/ui/src/lib/services/migration.service.ts @@ -119,7 +119,8 @@ const localStorageMigration: Migration = { // Only migrate if new key doesn't already exist const newValue = localStorage.getItem(newKey); if (newValue !== null) { - console.log(`[Migration] localStorage: ${newKey} already exists, skipping`); + if (import.meta.env.DEV && import.meta.env.VITE_DEBUG) + console.log(`[Migration] localStorage: ${newKey} already exists, skipping`); continue; } @@ -127,9 +128,11 @@ const localStorageMigration: Migration = { if (oldValue !== null) { localStorage.setItem(newKey, oldValue); // Keep old key for downgrade compatibility - DO NOT DELETE - console.log( - `[Migration] localStorage: copied ${deprecatedKey} → ${newKey} (preserved old)` - ); + if (import.meta.env.DEV && import.meta.env.VITE_DEBUG) { + console.log( + 
`[Migration] localStorage: copied ${deprecatedKey} → ${newKey} (preserved old)` + ); + } } } } @@ -146,7 +149,8 @@ const idxdbMigration: Migration = { async run(): Promise { const oldDbNames = await Dexie.getDatabaseNames(); if (!oldDbNames.includes(DB_APP_NAME_DEPRECATED)) { - console.log('[Migration] IndexedDB: no old database found, skipping'); + if (import.meta.env.DEV && import.meta.env.VITE_DEBUG) + console.log('[Migration] IndexedDB: no old database found, skipping'); return; } @@ -155,11 +159,13 @@ const idxdbMigration: Migration = { newDb.version(1).stores(IDXDB_STORES); const existingConvs = await newDb.table(IDXDB_TABLES.conversations).count(); if (existingConvs > 0) { - console.log('[Migration] IndexedDB: new database already has data, skipping'); + if (import.meta.env.DEV && import.meta.env.VITE_DEBUG) + console.log('[Migration] IndexedDB: new database already has data, skipping'); return; } - console.log('[Migration] IndexedDB: copying from', DB_APP_NAME_DEPRECATED); + if (import.meta.env.DEV && import.meta.env.VITE_DEBUG) + console.log('[Migration] IndexedDB: copying from', DB_APP_NAME_DEPRECATED); const oldDb = new Dexie(DB_APP_NAME_DEPRECATED); oldDb.version(1).stores(IDXDB_STORES); @@ -169,15 +175,18 @@ const idxdbMigration: Migration = { if (conversations.length > 0) { await newDb.table(IDXDB_TABLES.conversations).bulkAdd(conversations); - console.log(`[Migration] IndexedDB: copied ${conversations.length} conversations`); + if (import.meta.env.DEV && import.meta.env.VITE_DEBUG) + console.log(`[Migration] IndexedDB: copied ${conversations.length} conversations`); } if (messages.length > 0) { await newDb.table(IDXDB_TABLES.messages).bulkAdd(messages); - console.log(`[Migration] IndexedDB: copied ${messages.length} messages`); + if (import.meta.env.DEV && import.meta.env.VITE_DEBUG) + console.log(`[Migration] IndexedDB: copied ${messages.length} messages`); } // Non-destructive: DO NOT delete old database - keep for downgrade compatibility - 
console.log('[Migration] IndexedDB: preserved old database for downgrade compatibility'); + if (import.meta.env.DEV && import.meta.env.VITE_DEBUG) + console.log('[Migration] IndexedDB: preserved old database for downgrade compatibility'); } }; @@ -419,7 +428,8 @@ const legacyMessageMigration: Migration = { } } - console.log(`[Migration] Legacy messages: migrated ${migratedCount} messages`); + if (import.meta.env.DEV && import.meta.env.VITE_DEBUG) + console.log(`[Migration] Legacy messages: migrated ${migratedCount} messages`); } }; @@ -434,7 +444,8 @@ const themeMigration: Migration = { async run(): Promise { const legacyTheme = localStorage.getItem('theme'); if (legacyTheme === null) { - console.log('[Migration] Theme: no legacy theme key found, skipping'); + if (import.meta.env.DEV && import.meta.env.VITE_DEBUG) + console.log('[Migration] Theme: no legacy theme key found, skipping'); return; } @@ -443,7 +454,8 @@ const themeMigration: Migration = { const config = configRaw ? JSON.parse(configRaw) : {}; if (SETTINGS_KEYS.THEME in config) { - console.log('[Migration] Theme: config already has theme, skipping'); + if (import.meta.env.DEV && import.meta.env.VITE_DEBUG) + console.log('[Migration] Theme: config already has theme, skipping'); return; } @@ -451,7 +463,8 @@ const themeMigration: Migration = { localStorage.setItem(CONFIG_LOCALSTORAGE_KEY, JSON.stringify(config)); // Non-destructive: DO NOT delete legacy theme key - keep for downgrade compatibility - console.log(`[Migration] Theme: copied standalone theme to config (preserved old key)`); + if (import.meta.env.DEV && import.meta.env.VITE_DEBUG) + console.log(`[Migration] Theme: copied standalone theme to config (preserved old key)`); } }; @@ -491,7 +504,8 @@ export const MigrationService = { */ resetState(): void { localStorage.removeItem(MIGRATION_STATE_KEY); - console.log('[Migration] State reset - all migrations will run again'); + if (import.meta.env.DEV && import.meta.env.VITE_DEBUG) + 
console.log('[Migration] State reset - all migrations will run again'); }, /** @@ -500,25 +514,30 @@ export const MigrationService = { */ async runAllMigrations(): Promise { const state = getMigrationState(); - console.log('[Migration] Starting migration run, state:', state); + if (import.meta.env.DEV && import.meta.env.VITE_DEBUG) + console.log('[Migration] Starting migration run, state:', state); for (const migration of migrations) { if (isMigrationCompleted(migration.id)) { - console.log(`[Migration] ${migration.id}: already completed, skipping`); + if (import.meta.env.DEV && import.meta.env.VITE_DEBUG) + console.log(`[Migration] ${migration.id}: already completed, skipping`); continue; } try { - console.log(`[Migration] ${migration.id}: running...`); + if (import.meta.env.DEV && import.meta.env.VITE_DEBUG) + console.log(`[Migration] ${migration.id}: running...`); await migration.run(); markMigrationCompleted(migration.id); - console.log(`[Migration] ${migration.id}: completed successfully`); + if (import.meta.env.DEV && import.meta.env.VITE_DEBUG) + console.log(`[Migration] ${migration.id}: completed successfully`); } catch (error) { console.error(`[Migration] ${migration.id}: failed`, error); markMigrationFailed(migration.id); } } - console.log('[Migration] All migrations complete'); + if (import.meta.env.DEV && import.meta.env.VITE_DEBUG) + console.log('[Migration] All migrations complete'); } }; diff --git a/tools/ui/src/lib/stores/mcp.svelte.ts b/tools/ui/src/lib/stores/mcp.svelte.ts index 8fb306da8..effb78e33 100644 --- a/tools/ui/src/lib/stores/mcp.svelte.ts +++ b/tools/ui/src/lib/stores/mcp.svelte.ts @@ -20,11 +20,11 @@ */ import { browser } from '$app/environment'; -import { base } from '$app/paths'; import { SETTINGS_KEYS } from '$lib/constants'; import { MCPService } from '$lib/services/mcp.service'; import { config, settingsStore } from '$lib/stores/settings.svelte'; import { mcpResourceStore } from '$lib/stores/mcp-resources.svelte'; +import { 
serverStore } from '$lib/stores/server.svelte'; import { mode } from 'mode-watcher'; import { parseMcpServerSettings, @@ -43,7 +43,6 @@ import { ToolCallType } from '$lib/enums'; import { - CORS_PROXY_ENDPOINT, DEFAULT_CACHE_TTL_MS, DEFAULT_MCP_CONFIG, EXPECTED_THEMED_ICON_PAIR_COUNT, @@ -86,7 +85,6 @@ class MCPStore { private _toolCount = $state(0); private _connectedServers = $state([]); private _healthChecks = $state>({}); - private _proxyAvailable = $state(false); private connections = new Map(); private toolsIndex = new Map(); @@ -96,27 +94,8 @@ class MCPStore { private initPromise: Promise | null = null; private activeFlowCount = 0; - constructor() { - if (browser) { - this.probeProxy(); - } - } - - /** - * Probes the CORS proxy endpoint to determine availability. - * The endpoint is only registered when llama-server runs with --ui-mcp-proxy. - */ - async probeProxy(): Promise { - try { - const response = await fetch(`${base}${CORS_PROXY_ENDPOINT}`, { method: 'HEAD' }); - this._proxyAvailable = response.status !== 404; - } catch { - this._proxyAvailable = false; - } - } - get isProxyAvailable(): boolean { - return this._proxyAvailable; + return serverStore.props?.cors_proxy_enabled ?? 
false; } /** diff --git a/tools/ui/src/lib/stores/models.svelte.ts b/tools/ui/src/lib/stores/models.svelte.ts index 45981b38f..bc99d7412 100644 --- a/tools/ui/src/lib/stores/models.svelte.ts +++ b/tools/ui/src/lib/stores/models.svelte.ts @@ -3,7 +3,7 @@ import { toast } from 'svelte-sonner'; import { ServerModelStatus, ModelModality } from '$lib/enums'; import { ModelsService } from '$lib/services/models.service'; import { PropsService } from '$lib/services/props.service'; -import { serverStore } from '$lib/stores/server.svelte'; +import { serverStore, isRouterMode } from '$lib/stores/server.svelte'; import { TTLCache } from '$lib/utils'; import { MODEL_PROPS_CACHE_TTL_MS, @@ -14,14 +14,7 @@ import { import { conversationsStore } from '$lib/stores/conversations.svelte'; /** - * modelsStore - Reactive store for model management in both MODEL and ROUTER modes - * - * This store manages: - * - Available models list - * - Selected model for new conversations - * - Loaded models tracking (ROUTER mode) - * - Model usage tracking per conversation - * - Automatic unloading of unused models + * modelsStore - Reactive store for model management in both MODEL and ROUTER modes. * * **Architecture & Relationships:** * - **ModelsService**: Stateless service for model API communication @@ -31,14 +24,8 @@ import { conversationsStore } from '$lib/stores/conversations.svelte'; * * **API Inconsistency Workaround:** * In MODEL mode, `/props` returns modalities for the single model. - * In ROUTER mode, `/props` has no modalities - must use `/props?model=` per model. + * In ROUTER mode, `/props` has no modalities — must use `/props?model=` per model. * This store normalizes this behavior so consumers don't need to know the server mode. 
- * - * **Key Features:** - * - **MODEL mode**: Single model, always loaded - * - **ROUTER mode**: Multi-model with load/unload capability - * - **Auto-unload**: Automatically unloads models not used by any conversation - * - **Lazy loading**: ensureModelLoaded() loads models on demand */ class ModelsStore { /** @@ -57,8 +44,8 @@ class ModelsStore { selectedModelId = $state(null); selectedModelName = $state(null); - // dedup concurrent fetch() callers, all awaiters share the same inflight promise - // without this, ?model= URL handler raced an in-progress fetch and saw an empty list + // Dedup concurrent fetch() callers — all awaiters share the same inflight promise. + // Without this, ?model= URL handler races an in-progress fetch and sees an empty list. private inflightFetch: Promise | null = null; private modelUsage = $state>>(new Map()); @@ -67,9 +54,9 @@ class ModelsStore { favoriteModelIds = $state>(this.loadFavoritesFromStorage()); /** - * Model-specific props cache with TTL - * Key: modelId, Value: props data including modalities - * TTL: 10 minutes - props don't change frequently + * Model-specific props cache with TTL. + * Key: modelId, Value: props data including modalities. + * TTL: 10 minutes — props don't change frequently. */ private modelPropsCache = new TTLCache({ ttlMs: MODEL_PROPS_CACHE_TTL_MS, @@ -78,7 +65,7 @@ class ModelsStore { private modelPropsFetching = $state>(new Set()); /** - * Version counter for props cache - used to trigger reactivity when props are updated + * Version counter for props cache — used to trigger reactivity when props are updated. */ propsCacheVersion = $state(0); @@ -92,7 +79,7 @@ class ModelsStore { get selectedModel(): ModelOption | null { if (!this.selectedModelId) return null; - return this.models.find((model) => model.id === this.selectedModelId) ?? null; + return this.models.find((m) => m.id === this.selectedModelId) ?? 
null; } get loadedModelIds(): string[] { @@ -117,7 +104,7 @@ class ModelsStore { * In ROUTER mode, returns null (model is per-conversation). */ get singleModelName(): string | null { - if (serverStore.isRouterMode) return null; + if (isRouterMode()) return null; const props = serverStore.props; if (props?.model_alias) return props.model_alias; @@ -126,6 +113,11 @@ class ModelsStore { return props.model_path.split(/(\\|\/)/).pop() || null; } + get selectedModelContextSize(): number | null { + if (!this.selectedModelName) return null; + return this.getModelContextSize(this.selectedModelName); + } + /** * * @@ -134,10 +126,6 @@ class ModelsStore { * */ - /** - * Get modalities for a specific model - * Returns cached modalities from model props - */ getModelModalities(modelId: string): ModelModalities | null { const model = this.models.find((m) => m.model === modelId || m.id === modelId); if (model?.modalities) { @@ -146,46 +134,29 @@ class ModelsStore { const props = this.modelPropsCache.get(modelId); if (props?.modalities) { - return { - vision: props.modalities.vision ?? false, - audio: props.modalities.audio ?? false, - video: props.modalities.video ?? false - }; + return this.buildModalities(props.modalities); } return null; } - /** - * Check if a model supports vision modality - */ modelSupportsVision(modelId: string): boolean { return this.getModelModalities(modelId)?.vision ?? false; } - /** - * Check if a model supports audio modality - */ modelSupportsAudio(modelId: string): boolean { return this.getModelModalities(modelId)?.audio ?? false; } - /** - * Check if a model supports video modality - */ modelSupportsVideo(modelId: string): boolean { return this.getModelModalities(modelId)?.video ?? 
false; } - /** - * Get model modalities as an array of ModelModality enum values - */ getModelModalitiesArray(modelId: string): ModelModality[] { const modalities = this.getModelModalities(modelId); if (!modalities) return []; const result: ModelModality[] = []; - if (modalities.vision) result.push(ModelModality.VISION); if (modalities.audio) result.push(ModelModality.AUDIO); if (modalities.video) result.push(ModelModality.VIDEO); @@ -193,16 +164,10 @@ class ModelsStore { return result; } - /** - * Get props for a specific model (from cache) - */ getModelProps(modelId: string): ApiLlamaCppServerProps | null { return this.modelPropsCache.get(modelId); } - /** - * Get context size (n_ctx) for a specific model from cached props - */ getModelContextSize(modelId: string): number | null { const props = this.getModelProps(modelId); const nCtx = props?.default_generation_settings?.n_ctx; @@ -210,17 +175,6 @@ class ModelsStore { return typeof nCtx === 'number' ? nCtx : null; } - /** - * Get context size for the currently selected model or null if no model is selected - */ - get selectedModelContextSize(): number | null { - if (!this.selectedModelName) return null; - return this.getModelContextSize(this.selectedModelName); - } - - /** - * Check if props are being fetched for a model - */ isModelPropsFetching(modelId: string): boolean { return this.modelPropsFetching.has(modelId); } @@ -235,10 +189,10 @@ class ModelsStore { isModelLoaded(modelId: string): boolean { const model = this.routerModels.find((m) => m.id === modelId); + return ( model?.status.value === ServerModelStatus.LOADED || - model?.status.value === ServerModelStatus.SLEEPING || - false + model?.status.value === ServerModelStatus.SLEEPING ); } @@ -248,6 +202,7 @@ class ModelsStore { getModelStatus(modelId: string): ServerModelStatus | null { const model = this.routerModels.find((m) => m.id === modelId); + return model?.status.value ?? 
null; } @@ -257,6 +212,7 @@ class ModelsStore { isModelInUse(modelId: string): boolean { const usage = this.modelUsage.get(modelId); + return usage !== undefined && usage.size > 0; } @@ -269,8 +225,8 @@ class ModelsStore { */ /** - * Fetch list of models from server and detect server role - * Also fetches modalities for MODEL mode (single model) + * Fetch list of models from server and detect server role. + * Also fetches modalities for MODEL mode (single model). */ async fetch(force = false): Promise { if (this.inflightFetch) return this.inflightFetch; @@ -293,69 +249,87 @@ class ModelsStore { await serverStore.fetch(); } - const response = await ModelsService.list(); + const router = isRouterMode(); - const models: ModelOption[] = response.data.map((item: ApiModelDataEntry, index: number) => { - const details = response.models?.[index]; - const rawCapabilities = Array.isArray(details?.capabilities) ? details?.capabilities : []; - const displayNameSource = - details?.name && details.name.trim().length > 0 ? details.name : item.id; - const displayName = this.toDisplayName(displayNameSource); - const modelId = details?.model || item.id; + if (router) { + const response = await ModelsService.listRouter(); - return { - id: item.id, - name: displayName, - model: modelId, - description: details?.description, - capabilities: rawCapabilities.filter((value: unknown): value is string => Boolean(value)), - details: details?.details, - meta: item.meta ?? null, - parsedId: ModelsService.parseModelId(modelId), - aliases: item.aliases ?? [], - tags: item.tags ?? [] - } satisfies ModelOption; - }); + this.routerModels = response.data; + this.models = this.buildModelOptions(response); - this.models = models; + await this.fetchModalitiesForLoadedModels(); - // WORKAROUND: In MODEL mode, /props returns modalities for the single model, - // but /v1/models doesn't include modalities. We bridge this gap here. 
- const serverProps = serverStore.props; - if (serverStore.isModelMode && this.models.length > 0 && serverProps?.modalities) { - const modalities: ModelModalities = { - vision: serverProps.modalities.vision ?? false, - audio: serverProps.modalities.audio ?? false, - video: serverProps.modalities.video ?? false - }; - this.modelPropsCache.set(this.models[0].model, serverProps); - this.models = this.models.map((model, index) => - index === 0 ? { ...model, modalities } : model - ); + const visible = this.getVisibleModels(); + + if (visible.length === 1 && this.isModelLoaded(visible[0].model)) { + this.selectModelById(visible[0].id); + } + } else { + this.models = await this.fetchModelModeInternal(); } } catch (error) { this.models = []; this.error = error instanceof Error ? error.message : 'Failed to load models'; + throw error; } finally { this.loading = false; } } + /** Fetch models in MODEL mode (single model, standard OpenAI-compatible). */ + private async fetchModelModeInternal(): Promise { + const response = await ModelsService.list(); + + return this.buildModelOptions(response); + } + /** - * Fetch router models with full metadata (ROUTER mode only) - * This fetches the /models endpoint which returns status info for each model + * Build ModelOption[] from an API response. + * Both MODEL and ROUTER modes share the same mapping logic; + * they differ only in which endpoint is called. + */ + private buildModelOptions( + response: ApiModelListResponse | ApiRouterModelsListResponse + ): ModelOption[] { + return response.data.map((item: ApiModelDataEntry, index: number) => { + const details = response.models?.[index]; + const rawCapabilities = Array.isArray(details?.capabilities) ? details?.capabilities : []; + const displayNameSource = + details?.name && details.name.trim().length > 0 ? 
details.name : item.id; + const modelId = details?.model || item.id; + + return { + id: item.id, + name: this.toDisplayName(displayNameSource), + model: modelId, + description: details?.description, + capabilities: rawCapabilities.filter((value: unknown): value is string => Boolean(value)), + details: details?.details, + meta: item.meta ?? null, + parsedId: ModelsService.parseModelId(modelId), + aliases: item.aliases ?? [], + tags: item.tags ?? [] + }; + }); + } + + /** + * Fetch router models with full metadata (ROUTER mode only). + * No-op in router mode — fetch() already calls listRouter() internally. + * Kept for API compatibility (e.g. handleOpenChange dropdown open handler). */ async fetchRouterModels(): Promise { + if (!isRouterMode()) return; + try { const response = await ModelsService.listRouter(); this.routerModels = response.data; await this.fetchModalitiesForLoadedModels(); - const o = this.models.filter((option) => this.getModelProps(option.model)?.ui !== false); - - if (o.length === 1 && this.isModelLoaded(o[0].model)) { - this.selectModelById(o[0].id); + const visible = this.getVisibleModels(); + if (visible.length === 1 && this.isModelLoaded(visible[0].model)) { + this.selectModelById(visible[0].id); } } catch (error) { console.warn('Failed to fetch router models:', error); @@ -364,10 +338,10 @@ class ModelsStore { } /** - * Fetch props for a specific model from /props endpoint - * Uses caching to avoid redundant requests + * Fetch props for a specific model from /props endpoint. + * Uses caching to avoid redundant requests. * - * In ROUTER mode, this will only fetch props if the model is loaded, + * In ROUTER mode, this only fetches props if the model is loaded, * since unloaded models return 400 from /props endpoint. 
* * @param modelId - Model identifier to fetch props for @@ -397,10 +371,7 @@ class ModelsStore { } } - /** - * Fetch modalities for all loaded models from /props endpoint - * This updates the modalities field in models array - */ + /** Fetch modalities for all loaded models from /props endpoint. */ async fetchModalitiesForLoadedModels(): Promise { const loadedModelIds = this.loadedModelIds; if (loadedModelIds.length === 0) return; @@ -410,7 +381,6 @@ class ModelsStore { try { const results = await Promise.all(propsPromises); - // Update models with modalities this.models = this.models.map((model) => { const modelIndex = loadedModelIds.indexOf(model.model); if (modelIndex === -1) return model; @@ -418,13 +388,7 @@ class ModelsStore { const props = results[modelIndex]; if (!props?.modalities) return model; - const modalities: ModelModalities = { - vision: props.modalities.vision ?? false, - audio: props.modalities.audio ?? false, - video: props.modalities.video ?? false - }; - - return { ...model, modalities }; + return { ...model, modalities: this.buildModalities(props.modalities) }; }); this.propsCacheVersion++; @@ -433,17 +397,38 @@ class ModelsStore { } } + /** + * Update modalities for a specific model. + * Called when a model is loaded or when we need fresh modality data. + */ + async updateModelModalities(modelId: string): Promise { + const props = await this.fetchModelProps(modelId); + if (!props?.modalities) return; + + this.models = this.models.map((model) => + model.model === modelId + ? { ...model, modalities: this.buildModalities(props.modalities!) } + : model + ); + + this.propsCacheVersion++; + } + + /** + * Filter to models visible in the UI (ui !== false). + */ + private getVisibleModels(): ModelOption[] { + return this.models.filter((option) => this.getModelProps(option.model)?.ui !== false); + } + /** * Gets the model name from the last assistant message in the active conversation. 
- * Iterates backward through messages to find the most recent message with a model. * Used by both the chat page and settings page to maintain model consistency. - * @returns The model name or null if not found */ getModelFromLastAssistantResponse(): string | null { const messages = conversationsStore.activeMessages; if (!messages || messages.length === 0) return null; - // Iterate backward to find the last message with a model for (let i = messages.length - 1; i >= 0; i--) { if (messages[i].model) { return messages[i].model; @@ -456,22 +441,13 @@ class ModelsStore { /** * Auto-selects the model from the last assistant response if available and loaded. * Returns true if a model was selected, false otherwise. - * This is used by the chat page to maintain model consistency across page navigation. */ async selectModelFromLastAssistantResponse(): Promise { const lastModel = this.getModelFromLastAssistantResponse(); - if (!lastModel) return false; - - // Skip if already selected - if (this.selectedModelName === lastModel) return false; + if (!lastModel || this.selectedModelName === lastModel) return false; const matchingModel = this.models.find((option) => option.model === lastModel); - if (!matchingModel) return false; - - if (!this.isModelLoaded(lastModel)) { - console.log('[modelsStore] last assistant model not loaded:', lastModel); - return false; - } + if (!matchingModel || !this.isModelLoaded(lastModel)) return false; try { await this.selectModelById(matchingModel.id); @@ -484,22 +460,17 @@ class ModelsStore { } /** - * Auto-selects the first available model if none is selected, and fetches its props. + * Auto-selects the first available model if none is selected. * Prioritizes: * 1. Model from active conversation's last assistant response (if loaded) * 2. Model from active conversation's last assistant response (if not loaded) * 3. First loaded model (not from active conversation) * 4. 
First available model - * This is used to ensure default values are populated in settings pages. */ async ensureFirstModelSelected(): Promise { if (this.selectedModelName) return; - // Filter models that are visible in the UI - const availableModels = this.models.filter( - (option) => this.getModelProps(option.model)?.ui !== false - ); - + const availableModels = this.getVisibleModels(); if (availableModels.length === 0) return; // Try to select model from last assistant response first @@ -515,7 +486,7 @@ class ModelsStore { } } - // Try to find a loaded model first + // Try a loaded model first const loadedModel = availableModels.find((m) => this.isModelLoaded(m.model)); if (loadedModel) { await this.selectModelById(loadedModel.id); @@ -524,34 +495,7 @@ class ModelsStore { } // Fall back to the first available model - const firstModel = availableModels[0]; - await this.selectModelById(firstModel.id); - // Don't fetch props for unloaded models (will fail in ROUTER mode) - } - - /** - * Update modalities for a specific model - * Called when a model is loaded or when we need fresh modality data - */ - async updateModelModalities(modelId: string): Promise { - try { - const props = await this.fetchModelProps(modelId); - if (!props?.modalities) return; - - const modalities: ModelModalities = { - vision: props.modalities.vision ?? false, - audio: props.modalities.audio ?? false, - video: props.modalities.video ?? false - }; - - this.models = this.models.map((model) => - model.model === modelId ? 
{ ...model, modalities } : model - ); - - this.propsCacheVersion++; - } catch (error) { - console.warn(`Failed to update modalities for model ${modelId}:`, error); - } + await this.selectModelById(availableModels[0].id); } /** @@ -562,9 +506,6 @@ class ModelsStore { * */ - /** - * Select a model for new conversations - */ async selectModelById(modelId: string): Promise { if (!modelId || this.updating) return; if (this.selectedModelId === modelId) return; @@ -584,8 +525,7 @@ class ModelsStore { } /** - * Select a model by its model name (used for syncing with conversation model) - * @param modelName - Model name to select (e.g., "ggml-org/GLM-4.7-Flash-GGUF") + * Select a model by its model name (used for syncing with conversation model). */ selectModelByName(modelName: string): void { const option = this.models.find((model) => model.model === modelName); @@ -615,7 +555,7 @@ class ModelsStore { /** * * - * Loading/Unloading Models + * Loading / Unloading Models * * */ @@ -623,27 +563,18 @@ class ModelsStore { /** * WORKAROUND: Polling for model status after load/unload operations. * - * Currently, the `/models/load` and `/models/unload` endpoints return success - * before the operation actually completes on the server. This means an immediate - * request to `/models` returns stale status (e.g., "loading" after load request, - * "loaded" after unload request). + * Currently, `/models/load` and `/models/unload` return success before + * the operation actually completes on the server. * - * TODO: Remove this polling once llama-server properly waits for the operation - * to complete before returning success from `/load` and `/unload` endpoints. - * At that point, a single `fetchRouterModels()` call after the operation will - * be sufficient to get the correct status. + * TODO: Remove polling once llama-server properly waits for the operation + * to complete before returning success. 
*/ - /** Polling interval in ms for checking model status */ private static readonly STATUS_POLL_INTERVAL = 500; /** * Poll for expected model status after load/unload operation. - * Keeps polling indefinitely until the model reaches the expected status or fails. - * - * @param modelId - Model identifier to check - * @param expectedStatus - Expected status to wait for - * @throws Error if model reaches FAILED status + * Keeps polling until the model reaches the expected status or fails. */ private async pollForModelStatus( modelId: string, @@ -654,9 +585,7 @@ class ModelsStore { await this.fetchRouterModels(); const currentStatus = this.getModelStatus(modelId); - if (currentStatus === expectedStatus) { - return; - } + if (currentStatus === expectedStatus) return; if (currentStatus === ServerModelStatus.FAILED) { throw new Error( @@ -677,15 +606,8 @@ class ModelsStore { } } - /** - * Load a model (ROUTER mode) - * @param modelId - Model identifier to load - */ async loadModel(modelId: string): Promise { - if (this.isModelLoaded(modelId)) { - return; - } - + if (this.isModelLoaded(modelId)) return; if (this.modelLoadingStates.get(modelId)) return; this.modelLoadingStates.set(modelId, true); @@ -694,7 +616,6 @@ class ModelsStore { try { await ModelsService.load(modelId); await this.pollForModelStatus(modelId, ServerModelStatus.LOADED); - await this.updateModelModalities(modelId); toast.success(`Model loaded: ${this.toDisplayName(modelId)}`); } catch (error) { @@ -706,15 +627,8 @@ class ModelsStore { } } - /** - * Unload a model (ROUTER mode) - * @param modelId - Model identifier to unload - */ async unloadModel(modelId: string): Promise { - if (!this.isModelLoaded(modelId)) { - return; - } - + if (!this.isModelLoaded(modelId)) return; if (this.modelLoadingStates.get(modelId)) return; this.modelLoadingStates.set(modelId, true); @@ -722,7 +636,6 @@ class ModelsStore { try { await ModelsService.unload(modelId); - await this.pollForModelStatus(modelId, 
ServerModelStatus.UNLOADED); toast.info(`Model unloaded: ${this.toDisplayName(modelId)}`); } catch (error) { @@ -734,15 +647,8 @@ class ModelsStore { } } - /** - * Ensure a model is loaded before use - * @param modelId - Model identifier to ensure is loaded - */ async ensureModelLoaded(modelId: string): Promise { - if (this.isModelLoaded(modelId)) { - return; - } - + if (this.isModelLoaded(modelId)) return; await this.loadModel(modelId); } @@ -779,11 +685,9 @@ class ModelsStore { private loadFavoritesFromStorage(): Set { try { const raw = localStorage.getItem(FAVORITE_MODELS_LOCALSTORAGE_KEY); - return raw ? new Set(JSON.parse(raw) as string[]) : new Set(); } catch { toast.error('Failed to load favorite models from local storage'); - return new Set(); } } @@ -799,10 +703,19 @@ class ModelsStore { private toDisplayName(id: string): string { const segments = id.split(/\\|\//); const candidate = segments.pop(); - return candidate && candidate.trim().length > 0 ? candidate : id; } + private buildModalities( + modalities: NonNullable + ): ModelModalities { + return { + vision: modalities.vision ?? false, + audio: modalities.audio ?? false, + video: modalities.video ?? 
false + }; + } + clear(): void { this.models = []; this.routerModels = []; diff --git a/tools/ui/src/lib/types/api.d.ts b/tools/ui/src/lib/types/api.d.ts index 316ad5528..5f0a38dd3 100644 --- a/tools/ui/src/lib/types/api.d.ts +++ b/tools/ui/src/lib/types/api.d.ts @@ -203,6 +203,7 @@ export interface ApiLlamaCppServerProps { /** @deprecated Use {@link ui_settings} instead */ webui_settings?: Record; ui_settings?: Record; + cors_proxy_enabled?: boolean; } export interface ApiChatCompletionRequest { diff --git a/tools/ui/src/lib/utils/api-key-validation.ts b/tools/ui/src/lib/utils/api-key-validation.ts index 948b7d7b6..dbbf9a09b 100644 --- a/tools/ui/src/lib/utils/api-key-validation.ts +++ b/tools/ui/src/lib/utils/api-key-validation.ts @@ -12,17 +12,21 @@ export async function validateApiKey(fetch: typeof globalThis.fetch): Promise = { - 'Content-Type': 'application/json' + 'Content-Type': 'application/json', + Authorization: `Bearer ${apiKey}` }; - if (apiKey) { - headers.Authorization = `Bearer ${apiKey}`; - } - const response = await fetch(`${base}/props`, { headers }); if (!response.ok) { diff --git a/tools/ui/src/lib/utils/legacy-migration.ts b/tools/ui/src/lib/utils/legacy-migration.ts index 19755f6ee..6b0890a36 100644 --- a/tools/ui/src/lib/utils/legacy-migration.ts +++ b/tools/ui/src/lib/utils/legacy-migration.ts @@ -333,7 +333,8 @@ async function migrateConversation(convId: string): Promise { export async function runLegacyMigration(): Promise { if (!isMigrationNeeded()) return; - console.log('[Migration] Starting legacy message format migration...'); + if (import.meta.env.DEV && import.meta.env.VITE_DEBUG) + console.log('[Migration] Starting legacy message format migration...'); try { const conversations = await DatabaseService.getAllConversations(); @@ -344,12 +345,14 @@ export async function runLegacyMigration(): Promise { totalMigrated += count; } - if (totalMigrated > 0) { - console.log( - `[Migration] Migrated ${totalMigrated} messages across 
${conversations.length} conversations` - ); - } else { - console.log('[Migration] No legacy messages found, marking as done'); + if (import.meta.env.DEV && import.meta.env.VITE_DEBUG) { + if (totalMigrated > 0) { + console.log( + `[Migration] Migrated ${totalMigrated} messages across ${conversations.length} conversations` + ); + } else { + console.log('[Migration] No legacy messages found, marking as done'); + } } markMigrationDone(); diff --git a/tools/ui/src/routes/(chat)/+page.svelte b/tools/ui/src/routes/(chat)/+page.svelte index c272b438e..9db1d445f 100644 --- a/tools/ui/src/routes/(chat)/+page.svelte +++ b/tools/ui/src/routes/(chat)/+page.svelte @@ -3,7 +3,6 @@ import { chatStore } from '$lib/stores/chat.svelte'; import { conversationsStore, isConversationsInitialized } from '$lib/stores/conversations.svelte'; import { modelsStore, modelOptions } from '$lib/stores/models.svelte'; - import { isRouterMode } from '$lib/stores/server.svelte'; import { onMount } from 'svelte'; import { page } from '$app/state'; import { replaceState } from '$app/navigation'; @@ -72,23 +71,13 @@ conversationsStore.clearActiveConversation(); chatStore.clearUIState(); - if ( - isRouterMode() && - modelsStore.selectedModelName && - !modelsStore.isModelLoaded(modelsStore.selectedModelName) - ) { - modelsStore.clearSelection(); + await modelsStore.fetch(); - const first = modelOptions().find((m) => modelsStore.loadedModelIds.includes(m.model)); - if (first) { - await modelsStore.selectModelById(first.id); - } - } - - // Handle URL params only if we have ?q= or ?model= or ?new_chat=true if (qParam !== null || modelParam !== null || newChatParam === 'true') { await handleUrlParams(); } + + await modelsStore.ensureFirstModelSelected(); }); diff --git a/tools/ui/src/routes/+layout.svelte b/tools/ui/src/routes/+layout.svelte index e03d13fef..b35d20a5c 100644 --- a/tools/ui/src/routes/+layout.svelte +++ b/tools/ui/src/routes/+layout.svelte @@ -84,29 +84,34 @@ function checkApiKey() { const 
apiKey = config().apiKey; - if ( - (page.route.id === '/(chat)' || page.route.id === '/(chat)/chat/[id]') && - page.status !== 401 && - page.status !== 403 - ) { - const headers: Record = { - 'Content-Type': 'application/json' - }; - - if (apiKey && apiKey.trim() !== '') { - headers.Authorization = `Bearer ${apiKey.trim()}`; - } - - fetch(`${base}/props`, { headers }) - .then((response) => { - if (response.status === 401 || response.status === 403) { - window.location.reload(); - } - }) - .catch((e) => { - console.error('Error checking API key:', e); - }); + // No API key configured — server doesn't require auth, no need to validate. + // This mirrors the early return in validateApiKey() to avoid redundant /props requests. + if (!apiKey || apiKey.trim() === '') { + return; } + + untrack(() => { + if ( + (page.route.id === '/(chat)' || page.route.id === '/(chat)/chat/[id]') && + page.status !== 401 && + page.status !== 403 + ) { + const headers: Record = { + 'Content-Type': 'application/json', + Authorization: `Bearer ${apiKey.trim()}` + }; + + fetch(`${base}/props`, { headers }) + .then((response) => { + if (response.status === 401 || response.status === 403) { + window.location.reload(); + } + }) + .catch((e) => { + console.error('Error checking API key:', e); + }); + } + }); } function handleTitleUpdateCancel() { diff --git a/tools/ui/tests/client/page.svelte.test.ts b/tools/ui/tests/client/page.svelte.test.ts index 6849beb27..32e333d7f 100644 --- a/tools/ui/tests/client/page.svelte.test.ts +++ b/tools/ui/tests/client/page.svelte.test.ts @@ -4,8 +4,9 @@ import TestWrapper from './components/TestWrapper.svelte'; describe('/+page.svelte', () => { it('should render page without throwing', async () => { - // Basic smoke test - page should render without throwing errors - // API calls will fail in test environment but component should still mount - expect(() => render(TestWrapper)).not.toThrow(); + // Basic smoke test - page should render without throwing errors. 
+ // API calls are mocked in vitest-setup-client.ts. + await render(TestWrapper); + expect(true).toBe(true); }); }); diff --git a/tools/ui/vitest-setup-client.ts b/tools/ui/vitest-setup-client.ts index 570b9f0e1..0b753db02 100644 --- a/tools/ui/vitest-setup-client.ts +++ b/tools/ui/vitest-setup-client.ts @@ -1,2 +1,78 @@ /// /// + +import { beforeEach, vi } from 'vitest'; + +// Mock fetch for API calls during client tests. +// In test environment there is no backend server, so we intercept +// the specific endpoints the app uses and return valid mock data. +beforeEach(() => { + const originalFetch = globalThis.fetch; + + vi.spyOn(globalThis, 'fetch').mockImplementation(async (input: RequestInfo | URL, init?: RequestInit) => { + const url = typeof input === 'string' ? input : input instanceof URL ? input.href : input.url; + + // Mock server props endpoint + if (url.includes('/server')) { + return new Response( + JSON.stringify({ + mode: 'router', + version: 'test', + git_commit: 'test', + git_branch: 'test' + }), + { status: 200, headers: { 'Content-Type': 'application/json' } } + ); + } + + // Mock models list endpoint + if (/\/v1\/models|\/models\b/.test(url)) { + return new Response( + JSON.stringify({ + object: 'list', + data: [ + { + id: 'test-model.gguf', + object: 'model', + owned_by: 'llamacpp', + created: 0, + in_cache: false, + path: 'models/test-model.gguf', + status: { value: 'unloaded' }, + meta: {} + } + ], + models: [ + { + model: 'test-model.gguf', + name: 'Test Model', + details: {} + } + ] + }), + { status: 200, headers: { 'Content-Type': 'application/json' } } + ); + } + + // Mock /props endpoint (used for modalities) + if (url.includes('/props')) { + return new Response( + JSON.stringify({ + default_generation_settings: { n_ctx: 2048 } + }), + { status: 200, headers: { 'Content-Type': 'application/json' } } + ); + } + + // Mock /tools endpoint (used for built-in tools list) + if (url.includes('/tools')) { + return new Response(JSON.stringify([]), 
{ + status: 200, + headers: { 'Content-Type': 'application/json' } + }); + } + + // Default: use real fetch + return originalFetch(input, init); + }); +}); From b9a2170fce1f3f33cb4934b34efecb806bbbb348 Mon Sep 17 00:00:00 2001 From: Aleksander Grygier Date: Mon, 18 May 2026 16:17:21 +0200 Subject: [PATCH 02/33] feat: add scroll-to-bottom button to chat + prevent forced scroll down (#23270) --- .../app/chat/ChatScreen/ChatScreen.svelte | 9 +++- .../ChatScreenActionScrollDown.svelte | 48 +++++++++++++++++++ tools/ui/src/lib/components/app/chat/index.ts | 7 +++ .../src/lib/hooks/use-auto-scroll.svelte.ts | 8 ++++ 4 files changed, 70 insertions(+), 2 deletions(-) create mode 100644 tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenActionScrollDown.svelte diff --git a/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte index dc3eab134..bd93a569c 100644 --- a/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte +++ b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte @@ -8,6 +8,7 @@ ChatMessages, ChatScreenDragOverlay, ChatScreenProcessingInfo, + ChatScreenActionScrollDown, DialogEmptyFileAlert, DialogFileUploadError, DialogChatError, @@ -338,7 +339,9 @@ }); function handleMessagesReady() { - if (!disableAutoScroll && !autoScroll.userScrolledUp) { + if (disableAutoScroll) return; + + if (!autoScroll.userScrolledUp) { requestAnimationFrame(() => { autoScroll.scrollToBottom('instant'); }); @@ -405,7 +408,7 @@
{#if isEmpty}
@@ -419,6 +422,8 @@
{/if} + + {#if page.params.id} {/if} diff --git a/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenActionScrollDown.svelte b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenActionScrollDown.svelte new file mode 100644 index 000000000..3f3ee8677 --- /dev/null +++ b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenActionScrollDown.svelte @@ -0,0 +1,48 @@ + + +
+ +
diff --git a/tools/ui/src/lib/components/app/chat/index.ts b/tools/ui/src/lib/components/app/chat/index.ts index 5f6597980..9c7ce864e 100644 --- a/tools/ui/src/lib/components/app/chat/index.ts +++ b/tools/ui/src/lib/components/app/chat/index.ts @@ -667,3 +667,10 @@ export { default as ChatScreenForm } from './ChatScreen/ChatScreenForm.svelte'; * Only visible when `isCurrentConversationLoading` is true. */ export { default as ChatScreenProcessingInfo } from './ChatScreen/ChatScreenProcessingInfo.svelte'; + +/** + * Scroll-to-bottom action button. Displays a floating button when the user + * has scrolled up more than half a viewport height from the bottom. + * Takes the chat container element as a prop to manage scroll state internally. + */ +export { default as ChatScreenActionScrollDown } from './ChatScreen/ChatScreenActionScrollDown.svelte'; diff --git a/tools/ui/src/lib/hooks/use-auto-scroll.svelte.ts b/tools/ui/src/lib/hooks/use-auto-scroll.svelte.ts index f59e3ed4b..7bac452e4 100644 --- a/tools/ui/src/lib/hooks/use-auto-scroll.svelte.ts +++ b/tools/ui/src/lib/hooks/use-auto-scroll.svelte.ts @@ -100,6 +100,14 @@ export class AutoScrollController { this._autoScrollEnabled = true; } + /** + * Resets scroll state when switching conversations. + */ + resetScrollState(): void { + this._userScrolledUp = false; + this._autoScrollEnabled = true; + } + /** * Starts the auto-scroll interval for continuous scrolling during streaming. 
*/ From 3a9c1b854dc9290be7cf8e4c89bd404770020161 Mon Sep 17 00:00:00 2001 From: Aleksander Grygier Date: Mon, 18 May 2026 16:26:01 +0200 Subject: [PATCH 03/33] ui: Update KaTeX package and clean up logs from `sass` warnings (#23275) * ui: migrate katex imports to @use to resolve SCSS deprecation warnings * ci: Use `ubuntu-slim` for CI (UI) workflow --- .github/workflows/ui-ci.yml | 4 ++-- tools/ui/package-lock.json | 6 +++--- tools/ui/src/styles/katex-custom.scss | 9 ++++++--- tools/ui/vite.config.ts | 12 ------------ 4 files changed, 11 insertions(+), 20 deletions(-) diff --git a/.github/workflows/ui-ci.yml b/.github/workflows/ui-ci.yml index 43d6e1256..7f6f467dd 100644 --- a/.github/workflows/ui-ci.yml +++ b/.github/workflows/ui-ci.yml @@ -41,7 +41,7 @@ jobs: ui-checks: name: UI Checks needs: ui-build - runs-on: ubuntu-24.04-arm + runs-on: ubuntu-slim continue-on-error: true steps: - name: Checkout code @@ -93,7 +93,7 @@ jobs: e2e-tests: name: E2E Tests needs: ui-build - runs-on: ubuntu-24.04-arm + runs-on: ubuntu-slim steps: - name: Checkout code uses: actions/checkout@v6 diff --git a/tools/ui/package-lock.json b/tools/ui/package-lock.json index bf23307b8..3686eb326 100644 --- a/tools/ui/package-lock.json +++ b/tools/ui/package-lock.json @@ -6008,9 +6008,9 @@ } }, "node_modules/katex": { - "version": "0.16.22", - "resolved": "https://registry.npmjs.org/katex/-/katex-0.16.22.tgz", - "integrity": "sha512-XCHRdUw4lf3SKBaJe4EvgqIuWwkPSo9XoeO8GjQW94Bp7TWv9hNhzZjZ+OH9yf1UmLygb7DIT5GSFQiyt16zYg==", + "version": "0.16.47", + "resolved": "https://registry.npmjs.org/katex/-/katex-0.16.47.tgz", + "integrity": "sha512-Eeo8Ys1doU1z+x8AZsPpQu+p/QcZBI5PeOo7QGQdy2x2m0MU/hYagBbGOmXwr5KVbEfVuWv9LpnQWeehogurjg==", "dev": true, "funding": [ "https://opencollective.com/katex", diff --git a/tools/ui/src/styles/katex-custom.scss b/tools/ui/src/styles/katex-custom.scss index 9c8b96ed5..0e385844a 100644 --- a/tools/ui/src/styles/katex-custom.scss +++ 
b/tools/ui/src/styles/katex-custom.scss @@ -8,6 +8,9 @@ $use-ttf: false; $font-folder: 'katex-fonts'; // Import KaTeX SCSS with overridden variables -// Note: @import is deprecated but required because KaTeX uses @import internally -// The deprecation warnings are from KaTeX's code and cannot be avoided -@import 'katex/src/styles/katex.scss'; +@use 'katex/src/styles/katex.scss' with ( + $use-woff2: true, + $use-woff: false, + $use-ttf: false, + $font-folder: 'katex-fonts' +); diff --git a/tools/ui/vite.config.ts b/tools/ui/vite.config.ts index d3db24bf2..f89a689d5 100644 --- a/tools/ui/vite.config.ts +++ b/tools/ui/vite.config.ts @@ -23,18 +23,6 @@ export default defineConfig({ minify: true }, - css: { - preprocessorOptions: { - scss: { - additionalData: ` - $use-woff2: true; - $use-woff: false; - $use-ttf: false; - ` - } - } - }, - plugins: [tailwindcss(), sveltekit(), devtoolsJson(), llamaCppBuildPlugin()], test: { From 45b455e66fc09abed65b7d52d42a4a29ba0d45d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Mon, 18 May 2026 17:11:47 +0200 Subject: [PATCH 04/33] common : remove hf cache migration (#23266) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- common/arg.cpp | 7 -- common/hf-cache.cpp | 274 -------------------------------------------- common/hf-cache.h | 4 - 3 files changed, 285 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index d7a935fc1..ab23b77e0 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -4,7 +4,6 @@ #include "chat.h" #include "common.h" #include "download.h" -#include "hf-cache.h" #include "json-schema-to-grammar.h" #include "log.h" #include "sampling.h" @@ -586,12 +585,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context // parse the first time to get -hf option (used for remote preset) parse_cli_args(); - // TODO: Remove later - try { - hf_cache::migrate_old_cache_to_hf_cache(params.hf_token, 
params.offline); - } catch (const std::exception & e) { - LOG_WRN("HF cache migration failed: %s\n", e.what()); - } // export_graph_ops loads only metadata const bool skip_model_download = ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS; diff --git a/common/hf-cache.cpp b/common/hf-cache.cpp index 20f33e4c7..ba7417a12 100644 --- a/common/hf-cache.cpp +++ b/common/hf-cache.cpp @@ -11,7 +11,6 @@ #include #include #include -#include // migration only #include #include #include @@ -336,15 +335,9 @@ hf_files get_repo_files(const std::string & repo_id, if (item["lfs"].contains("oid") && item["lfs"]["oid"].is_string()) { file.oid = item["lfs"]["oid"].get(); } - if (item["lfs"].contains("size") && item["lfs"]["size"].is_number()) { - file.size = item["lfs"]["size"].get(); - } } else if (item.contains("oid") && item["oid"].is_string()) { file.oid = item["oid"].get(); } - if (file.size == 0 && item.contains("size") && item["size"].is_number()) { - file.size = item["size"].get(); - } if (!file.oid.empty() && !is_valid_oid(file.oid)) { LOG_WRN("%s: skip invalid oid: %s\n", __func__, file.oid.c_str()); @@ -502,271 +495,4 @@ std::string finalize_file(const hf_file & file) { return file.final_path; } -// delete everything after this line, one day - -// copied from download.cpp without the tag part -struct gguf_split_info { - std::string prefix; // tag included - int index; - int count; -}; - -static gguf_split_info get_gguf_split_info(const std::string & path) { - static const std::regex re_split("^(.+)-([0-9]{5})-of-([0-9]{5})$", std::regex::icase); - std::smatch m; - - std::string prefix = path; - if (!string_remove_suffix(prefix, ".gguf")) { - return {}; - } - - int index = 1; - int count = 1; - - if (std::regex_match(prefix, m, re_split)) { - index = std::stoi(m[2].str()); - count = std::stoi(m[3].str()); - prefix = m[1].str(); - } - - return {std::move(prefix), index, count}; -} - -static std::pair parse_manifest_name(std::string & filename) { - static const std::regex 
re(R"(^manifest=([^=]+)=([^=]+)=.*\.json$)"); - std::smatch match; - if (std::regex_match(filename, match, re)) { - return {match[1].str(), match[2].str()}; - } - return {}; -} - -static std::string make_old_cache_filename(const std::string & owner, - const std::string & repo, - const std::string & filename) { - auto result = owner + "_" + repo + "_" + filename; - string_replace_all(result, "/", "_"); - return result; -} - -struct migrate_file { - std::string path; - std::string sha256; - size_t size; - fs::path old_path; - fs::path etag_path; - const hf_file * file; -}; - -using migrate_files = std::vector; - -static bool collect_file(const fs::path & old_cache, - const std::string & owner, - const std::string & repo, - const std::string & path, - const std::string & sha256, - const hf_files & files, - migrate_files & to_migrate) { - - const hf_file * file = nullptr; - - for (const auto & f : files) { - if (f.path == path) { - file = &f; - break; - } - } - - std::string old_filename = make_old_cache_filename(owner, repo, path); - fs::path old_path = old_cache / old_filename; - fs::path etag_path = old_path.string() + ".etag"; - - if (!fs::exists(old_path)) { - if (file && fs::exists(file->final_path)) { - return true; - } - LOG_WRN("%s: %s not found in old cache or HF cache\n", __func__, old_filename.c_str()); - return false; - } - - if (!file) { - LOG_WRN("%s: %s not found in current repo\n", __func__, old_filename.c_str()); - return false; - } - - if (!sha256.empty() && !file->oid.empty() && sha256 != file->oid) { - LOG_WRN("%s: %s is not up to date (sha256 mismatch)\n", __func__, old_filename.c_str()); - return false; - } - - if (file->size > 0) { - size_t size = fs::file_size(old_path); - if (size != file->size) { - LOG_WRN("%s: %s has wrong size %zu (expected %zu)\n", __func__, old_filename.c_str(), size, file->size); - return false; - } - } - - to_migrate.push_back({path, sha256, file->size, old_path, etag_path, file}); - return true; -} - -static bool 
collect_files(const fs::path & old_cache, - const std::string & owner, - const std::string & repo, - const nl::json & node, - const hf_files & files, - migrate_files & to_migrate) { - - if (!node.contains("rfilename") || - !node.contains("lfs") || - !node["lfs"].contains("sha256")) { - return true; - } - - std::string path = node["rfilename"]; - std::string sha256 = node["lfs"]["sha256"]; - - auto split = get_gguf_split_info(path); - - if (split.count <= 1) { - return collect_file(old_cache, owner, repo, path, sha256, files, to_migrate); - } - - std::vector> splits; - - for (const auto & f : files) { - auto split_f = get_gguf_split_info(f.path); - if (split_f.count == split.count && split_f.prefix == split.prefix) { - // sadly the manifest only provides the sha256 of the first file (index == 1) - // the rest will be verified using the size... - std::string f_sha256 = (split_f.index == 1) ? sha256 : ""; - splits.emplace_back(f.path, f_sha256); - } - } - - if ((int)splits.size() != split.count) { - LOG_WRN("%s: expected %d split files but found %d in repo\n", __func__, split.count, (int)splits.size()); - return false; - } - - for (const auto & [f_path, f_sha256] : splits) { - if (!collect_file(old_cache, owner, repo, f_path, f_sha256, files, to_migrate)) { - return false; - } - } - - return true; -} - -static bool migrate_file(const migrate_file & file) { - std::error_code ec; - - fs::path new_path(file.file->local_path); - fs::create_directories(new_path.parent_path(), ec); - - if (!fs::exists(new_path, ec)) { - fs::rename(file.old_path, new_path, ec); - if (ec) { - fs::copy_file(file.old_path, new_path, ec); - if (ec) { - LOG_ERR("%s: failed to move/copy %s: %s\n", __func__, file.old_path.string().c_str(), ec.message().c_str()); - return false; - } - } - fs::remove(file.old_path, ec); - } - fs::remove(file.etag_path, ec); - - std::string filename = finalize_file(*file.file); - LOG_INF("%s: migrated %s -> %s\n", __func__, file.old_path.filename().string().c_str(), 
filename.c_str()); - return true; -} - -void migrate_old_cache_to_hf_cache(const std::string & token, bool offline) { - fs::path old_cache = fs_get_cache_directory(); - if (!fs::exists(old_cache)) { - return; - } - - if (offline) { - LOG_WRN("%s: skipping migration in offline mode (will run when online)\n", __func__); - return; // -hf is not going to work - } - - bool warned = false; - - for (const auto & entry : fs::directory_iterator(old_cache)) { - if (!entry.is_regular_file()) { - continue; - } - auto filename = entry.path().filename().string(); - auto [owner, repo] = parse_manifest_name(filename); - - if (owner.empty() || repo.empty()) { - continue; - } - - if (!warned) { - warned = true; - LOG_WRN("================================================================================\n" - "WARNING: Migrating cache to HuggingFace cache directory\n" - " Old cache: %s\n" - " New cache: %s\n" - "This one-time migration moves models previously downloaded with -hf\n" - "from the legacy llama.cpp cache to the standard HuggingFace cache.\n" - "Models downloaded with --model-url are not affected.\n" - "================================================================================\n", - old_cache.string().c_str(), get_cache_directory().string().c_str()); - } - - auto repo_id = owner + "/" + repo; - auto files = get_repo_files(repo_id, token); - - if (files.empty()) { - LOG_WRN("%s: could not get repo files for %s, skipping\n", __func__, repo_id.c_str()); - continue; - } - - migrate_files to_migrate; - bool ok = true; - - try { - std::ifstream manifest(entry.path()); - auto json = nl::json::parse(manifest); - for (const char * key : {"ggufFile", "mmprojFile"}) { - if (json.contains(key)) { - if (!collect_files(old_cache, owner, repo, json[key], files, to_migrate)) { - ok = false; - break; - } - } - } - } catch (const std::exception & e) { - LOG_WRN("%s: failed to parse manifest %s: %s\n", __func__, filename.c_str(), e.what()); - continue; - } - - if (!ok) { - LOG_WRN("%s: 
migration skipped: one or more files failed validation\n", __func__); - continue; - } - - for (const auto & file : to_migrate) { - if (!migrate_file(file)) { - ok = false; - break; - } - } - - if (!ok) { - LOG_WRN("%s: migration failed: could not migrate all files\n", __func__); - continue; - } - - LOG_INF("%s: migration complete, deleting manifest: %s\n", __func__, entry.path().string().c_str()); - fs::remove(entry.path()); - } -} - } // namespace hf_cache diff --git a/common/hf-cache.h b/common/hf-cache.h index 9e46f9774..23fa0adb7 100644 --- a/common/hf-cache.h +++ b/common/hf-cache.h @@ -14,7 +14,6 @@ struct hf_file { std::string final_path; std::string oid; std::string repo_id; - size_t size = 0; // only for the migration }; using hf_files = std::vector; @@ -30,7 +29,4 @@ hf_files get_cached_files(const std::string & repo_id = {}); // Create snapshot path (link or move/copy) and return it std::string finalize_file(const hf_file & file); -// TODO: Remove later -void migrate_old_cache_to_hf_cache(const std::string & token, bool offline = false); - } // namespace hf_cache From 5cbaa5e69e09bde3334cd8c355570553a0dca027 Mon Sep 17 00:00:00 2001 From: SamareshSingh <97642706+ssam18@users.noreply.github.com> Date: Mon, 18 May 2026 15:14:45 -0500 Subject: [PATCH 05/33] docker : add OCI image labels for version and build date (#21653) * docker: add OCI image labels to all published images * docker: propagate OCI labels as manifest and index annotations * docker: drop hardcoded org URL and revert accidental intel version bump The OCI image url and source are now driven by build args with a sensible default. The workflow passes the actual repository url so fork builds get labels pointing at the fork instead of upstream. Also restores the IGC, compute runtime, and IGDGMM versions in the intel Dockerfile labeled stage which I accidentally bumped in the first commit. 
* docker: add skip_s390x workflow_dispatch input for fast test runs Lets maintainers and PR authors trigger the docker workflow without the s390x build target, which depends on the IBM Z runner and is by far the slowest job in the matrix. The flag filters the s390x row out of the build matrix before merge_matrix is derived, so the merge job sees a consistent shape too. Signed-off-by: Samaresh Kumar Singh --------- Signed-off-by: Samaresh Kumar Singh --- .devops/cann.Dockerfile | 16 ++++++ .devops/cpu.Dockerfile | 16 ++++++ .devops/cuda.Dockerfile | 17 +++++++ .devops/intel.Dockerfile | 16 ++++++ .devops/llama-cli-cann.Dockerfile | 17 +++++++ .devops/musa.Dockerfile | 17 +++++++ .devops/openvino.Dockerfile | 16 ++++++ .devops/rocm.Dockerfile | 17 +++++++ .devops/s390x.Dockerfile | 16 ++++++ .devops/vulkan.Dockerfile | 16 ++++++ .github/workflows/docker.yml | 83 +++++++++++++++++++++++++++++-- 11 files changed, 242 insertions(+), 5 deletions(-) diff --git a/.devops/cann.Dockerfile b/.devops/cann.Dockerfile index 843fe37d0..acd1e26bc 100644 --- a/.devops/cann.Dockerfile +++ b/.devops/cann.Dockerfile @@ -5,6 +5,9 @@ # Define the CANN base image for easier version updates later ARG CHIP_TYPE=910b ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.5.0-${CHIP_TYPE}-openeuler24.03-py3.11 +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A # ============================================================================== # BUILD STAGE @@ -67,6 +70,19 @@ RUN mkdir -p /app/full && \ # ============================================================================== FROM ${CANN_BASE_IMAGE} AS base +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + 
org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + # -- Install runtime dependencies -- RUN yum install -y libgomp curl && \ yum clean all && \ diff --git a/.devops/cpu.Dockerfile b/.devops/cpu.Dockerfile index d6579ecf1..c8f32235d 100644 --- a/.devops/cpu.Dockerfile +++ b/.devops/cpu.Dockerfile @@ -1,4 +1,7 @@ ARG UBUNTU_VERSION=24.04 +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A FROM ubuntu:$UBUNTU_VERSION AS build @@ -35,6 +38,19 @@ RUN mkdir -p /app/full \ ## Base image FROM ubuntu:$UBUNTU_VERSION AS base +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + RUN apt-get update \ && apt-get install -y libgomp1 curl \ && apt autoremove -y \ diff --git a/.devops/cuda.Dockerfile b/.devops/cuda.Dockerfile index b3f6ccfc9..3805ea3a0 100644 --- a/.devops/cuda.Dockerfile +++ b/.devops/cuda.Dockerfile @@ -6,6 +6,10 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A + FROM ${BASE_CUDA_DEV_CONTAINER} AS build # CUDA architecture to build for (defaults to all supported archs) @@ -40,6 +44,19 @@ RUN mkdir -p /app/full \ ## Base image FROM ${BASE_CUDA_RUN_CONTAINER} AS base +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG 
IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + RUN apt-get update \ && apt-get install -y libgomp1 curl \ && apt autoremove -y \ diff --git a/.devops/intel.Dockerfile b/.devops/intel.Dockerfile index da164dcfa..218418b80 100644 --- a/.devops/intel.Dockerfile +++ b/.devops/intel.Dockerfile @@ -1,4 +1,7 @@ ARG ONEAPI_VERSION=2025.3.3-0-devel-ubuntu24.04 +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A ## Build Image @@ -40,6 +43,19 @@ RUN mkdir -p /app/full \ FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + ARG IGC_VERSION=v2.20.5 ARG IGC_VERSION_FULL=2_2.20.5+19972 ARG COMPUTE_RUNTIME_VERSION=25.40.35563.10 diff --git a/.devops/llama-cli-cann.Dockerfile b/.devops/llama-cli-cann.Dockerfile index d54e70838..447d871ac 100644 --- a/.devops/llama-cli-cann.Dockerfile +++ b/.devops/llama-cli-cann.Dockerfile @@ -1,4 +1,7 @@ ARG ASCEND_VERSION=8.5.0-910b-openeuler22.03-py3.10 +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A FROM ascendai/cann:$ASCEND_VERSION AS build @@ -28,6 +31,20 @@ RUN echo "Building with static libs" && \ # TODO: use image with 
NNRT FROM ascendai/cann:$ASCEND_VERSION AS runtime + +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion / ENV LC_ALL=C.utf8 diff --git a/.devops/musa.Dockerfile b/.devops/musa.Dockerfile index 665a76f58..a7f70b5f0 100644 --- a/.devops/musa.Dockerfile +++ b/.devops/musa.Dockerfile @@ -6,6 +6,10 @@ ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_V ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64 +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A + FROM ${BASE_MUSA_DEV_CONTAINER} AS build # MUSA architecture to build for (defaults to all supported archs) @@ -45,6 +49,19 @@ RUN mkdir -p /app/full \ ## Base image FROM ${BASE_MUSA_RUN_CONTAINER} AS base +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + RUN apt-get update \ && apt-get install -y libgomp1 curl \ && apt autoremove -y \ diff --git a/.devops/openvino.Dockerfile b/.devops/openvino.Dockerfile index 
31b58736d..1266713f3 100644 --- a/.devops/openvino.Dockerfile +++ b/.devops/openvino.Dockerfile @@ -18,6 +18,10 @@ ARG LIBZE1_VERSION=1.27.0-1~24.04~ppa2 ARG http_proxy= ARG https_proxy= +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A + ## Build Image FROM ubuntu:${UBUNTU_VERSION} AS build @@ -88,6 +92,18 @@ FROM ubuntu:${UBUNTU_VERSION} AS base # Pass proxy args to runtime stage ARG http_proxy ARG https_proxy +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE RUN apt-get update \ && apt-get install -y libgomp1 libtbb12 curl wget ocl-icd-libopencl1 \ diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile index 525ddc790..2da15975d 100644 --- a/.devops/rocm.Dockerfile +++ b/.devops/rocm.Dockerfile @@ -7,6 +7,10 @@ ARG AMDGPU_VERSION=7.2.1 # Target the ROCm build image ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A + ### Build image FROM ${BASE_ROCM_DEV_CONTAINER} AS build @@ -57,6 +61,19 @@ RUN mkdir -p /app/full \ ## Base image FROM ${BASE_ROCM_DEV_CONTAINER} AS base +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + 
org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + RUN apt-get update \ && apt-get install -y libgomp1 curl \ && apt autoremove -y \ diff --git a/.devops/s390x.Dockerfile b/.devops/s390x.Dockerfile index 757cd97cd..d36f5f3cc 100644 --- a/.devops/s390x.Dockerfile +++ b/.devops/s390x.Dockerfile @@ -1,5 +1,8 @@ ARG GCC_VERSION=15.2.0 ARG UBUNTU_VERSION=24.04 +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A ### Build Llama.cpp stage FROM gcc:${GCC_VERSION} AS build @@ -52,6 +55,19 @@ COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py ### Base image FROM ubuntu:${UBUNTU_VERSION} AS base +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \ apt update -y && \ diff --git a/.devops/vulkan.Dockerfile b/.devops/vulkan.Dockerfile index f4d199ed4..464ccfef1 100644 --- a/.devops/vulkan.Dockerfile +++ b/.devops/vulkan.Dockerfile @@ -1,4 +1,7 @@ ARG UBUNTU_VERSION=26.04 +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A FROM ubuntu:$UBUNTU_VERSION AS build @@ -31,6 +34,19 @@ RUN mkdir -p /app/full \ ## Base image FROM ubuntu:$UBUNTU_VERSION AS base +ARG BUILD_DATE=N/A +ARG APP_VERSION=N/A +ARG APP_REVISION=N/A +ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp +ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp +LABEL 
org.opencontainers.image.created=$BUILD_DATE \ + org.opencontainers.image.version=$APP_VERSION \ + org.opencontainers.image.revision=$APP_REVISION \ + org.opencontainers.image.title="llama.cpp" \ + org.opencontainers.image.description="LLM inference in C/C++" \ + org.opencontainers.image.url=$IMAGE_URL \ + org.opencontainers.image.source=$IMAGE_SOURCE + RUN apt-get update \ && apt-get install -y libgomp1 curl libvulkan1 mesa-vulkan-drivers \ libglvnd0 libgl1 libglx0 libegl1 libgles2 \ diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index a5bae7141..6f1f2721e 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -11,6 +11,11 @@ name: Publish Docker image on: workflow_dispatch: # allows manual triggering + inputs: + skip_s390x: + description: "Skip the s390x build target (useful for fast test runs that do not need the IBM Z runner)" + type: boolean + default: false schedule: # Rebuild daily rather than on every push because it is expensive - cron: '12 4 * * *' @@ -64,6 +69,8 @@ jobs: - name: Generate build and merge matrices id: matrices shell: bash + env: + SKIP_S390X: ${{ inputs.skip_s390x || 'false' }} run: | set -euo pipefail @@ -86,6 +93,11 @@ jobs: ] JSON + if [ "${SKIP_S390X}" = "true" ]; then + jq 'map(select(.platforms != "linux/s390x"))' build-matrix.json > build-matrix.json.tmp + mv build-matrix.json.tmp build-matrix.json + fi + BUILD_MATRIX="$(jq -c . 
build-matrix.json)" MERGE_MATRIX="$(jq -c ' reduce .[] as $entry ({}; .[$entry.tag] |= ( @@ -132,6 +144,7 @@ jobs: config: ${{ fromJSON(needs.prepare_matrices.outputs.build_matrix) }} steps: - name: Check out the repo + id: checkout uses: actions/checkout@v6 with: fetch-depth: 0 @@ -187,6 +200,10 @@ jobs: env: GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}' + - name: Get build date + id: build_date + run: echo "date=$(date -u +"%Y-%m-%dT%H:%M:%SZ")" >> $GITHUB_OUTPUT + - name: Free Disk Space (Ubuntu) if: ${{ matrix.config.free_disk_space == true }} uses: ggml-org/free-disk-space@v1.3.1 @@ -211,13 +228,26 @@ jobs: with: context: . platforms: ${{ matrix.config.platforms }} - outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true + outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true,oci-mediatypes=true file: ${{ matrix.config.dockerfile }} target: full provenance: false build-args: | + BUILD_DATE=${{ steps.build_date.outputs.date }} + APP_VERSION=${{ needs.create_tag.outputs.source_tag }} + APP_REVISION=${{ steps.checkout.outputs.commit }} + IMAGE_URL=${{ github.server_url }}/${{ github.repository }} + IMAGE_SOURCE=${{ github.server_url }}/${{ github.repository }} ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }} ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }} + annotations: | + manifest:org.opencontainers.image.created=${{ steps.build_date.outputs.date }} + manifest:org.opencontainers.image.version=${{ needs.create_tag.outputs.source_tag }} + manifest:org.opencontainers.image.revision=${{ steps.checkout.outputs.commit }} + manifest:org.opencontainers.image.title=llama.cpp + manifest:org.opencontainers.image.description=LLM inference in C/C++ + manifest:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }} + 
manifest:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }} # using github experimental cache #cache-from: type=gha #cache-to: type=gha,mode=max @@ -235,13 +265,26 @@ jobs: with: context: . platforms: ${{ matrix.config.platforms }} - outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true + outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true,oci-mediatypes=true file: ${{ matrix.config.dockerfile }} target: light provenance: false build-args: | + BUILD_DATE=${{ steps.build_date.outputs.date }} + APP_VERSION=${{ needs.create_tag.outputs.source_tag }} + APP_REVISION=${{ steps.checkout.outputs.commit }} + IMAGE_URL=${{ github.server_url }}/${{ github.repository }} + IMAGE_SOURCE=${{ github.server_url }}/${{ github.repository }} ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }} ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }} + annotations: | + manifest:org.opencontainers.image.created=${{ steps.build_date.outputs.date }} + manifest:org.opencontainers.image.version=${{ needs.create_tag.outputs.source_tag }} + manifest:org.opencontainers.image.revision=${{ steps.checkout.outputs.commit }} + manifest:org.opencontainers.image.title=llama.cpp + manifest:org.opencontainers.image.description=LLM inference in C/C++ + manifest:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }} + manifest:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }} # using github experimental cache #cache-from: type=gha #cache-to: type=gha,mode=max @@ -259,13 +302,26 @@ jobs: with: context: . 
platforms: ${{ matrix.config.platforms }} - outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true + outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true,oci-mediatypes=true file: ${{ matrix.config.dockerfile }} target: server provenance: false build-args: | + BUILD_DATE=${{ steps.build_date.outputs.date }} + APP_VERSION=${{ needs.create_tag.outputs.source_tag }} + APP_REVISION=${{ steps.checkout.outputs.commit }} + IMAGE_URL=${{ github.server_url }}/${{ github.repository }} + IMAGE_SOURCE=${{ github.server_url }}/${{ github.repository }} ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }} ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }} + annotations: | + manifest:org.opencontainers.image.created=${{ steps.build_date.outputs.date }} + manifest:org.opencontainers.image.version=${{ needs.create_tag.outputs.source_tag }} + manifest:org.opencontainers.image.revision=${{ steps.checkout.outputs.commit }} + manifest:org.opencontainers.image.title=llama.cpp + manifest:org.opencontainers.image.description=LLM inference in C/C++ + manifest:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }} + manifest:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }} # using github experimental cache #cache-from: type=gha #cache-to: type=gha,mode=max @@ -330,10 +386,15 @@ jobs: steps: - name: Check out the repo + id: checkout uses: actions/checkout@v6 with: fetch-depth: 0 + - name: Get build date + id: build_date + run: echo "date=$(date -u +"%Y-%m-%dT%H:%M:%SZ")" >> $GITHUB_OUTPUT + - name: Download digest metadata uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8 with: @@ -361,6 +422,8 @@ jobs: IMAGE_REPO="ghcr.io/${REPO_OWNER}/${REPO_NAME}" PREFIX="${IMAGE_REPO}:" SRC_TAG="${{ 
needs.create_tag.outputs.source_tag }}" + BUILD_DATE="${{ steps.build_date.outputs.date }}" + COMMIT_SHA="${{ steps.checkout.outputs.commit }}" TAGS="${{ matrix.config.tag }}" ARCHES="${{ matrix.config.arches }}" DIGEST_GLOB="/tmp/digests/*.tsv" @@ -412,11 +475,21 @@ jobs: refs+=("${IMAGE_REPO}@${digest}") done + local annotations=( + --annotation "index:org.opencontainers.image.created=${BUILD_DATE}" + --annotation "index:org.opencontainers.image.version=${SRC_TAG}" + --annotation "index:org.opencontainers.image.revision=${COMMIT_SHA}" + --annotation "index:org.opencontainers.image.title=llama.cpp" + --annotation "index:org.opencontainers.image.description=LLM inference in C/C++" + --annotation "index:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }}" + --annotation "index:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}" + ) + echo "Creating ${merged_tag} from ${refs[*]}" - docker buildx imagetools create --tag "${merged_tag}" "${refs[@]}" + docker buildx imagetools create "${annotations[@]}" --tag "${merged_tag}" "${refs[@]}" echo "Creating ${merged_versioned_tag} from ${refs[*]}" - docker buildx imagetools create --tag "${merged_versioned_tag}" "${refs[@]}" + docker buildx imagetools create "${annotations[@]}" --tag "${merged_versioned_tag}" "${refs[@]}" } for tag in $TAGS; do From b7340443d4cf6f89e8616de87ec6b3557055e581 Mon Sep 17 00:00:00 2001 From: Pranav Dhinakar Date: Mon, 18 May 2026 13:39:36 -0700 Subject: [PATCH 06/33] ggml-hexagon: add PAD op HVX kernel (#23078) * ggml-hexagon: add PAD op HVX kernel Implements GGML_OP_PAD on the Hexagon HTP backend using HVX vectorized kernels. Supports zero-padding and circular padding across all 4 tensor dimensions. 
* hex-ggml: remove duplicate op cases (merge conflict) * hex-pad: fix editorconfig checks and macro alignment --------- Co-authored-by: Max Krasnyansky --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 18 + ggml/src/ggml-hexagon/htp/CMakeLists.txt | 1 + ggml/src/ggml-hexagon/htp/htp-ctx.h | 1 + ggml/src/ggml-hexagon/htp/htp-ops.h | 1 + ggml/src/ggml-hexagon/htp/main.c | 3 + ggml/src/ggml-hexagon/htp/pad-ops.c | 545 +++++++++++++++++++++++ 6 files changed, 569 insertions(+) create mode 100644 ggml/src/ggml-hexagon/htp/pad-ops.c diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 3d1c9da83..c24a2305e 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -2744,6 +2744,18 @@ static bool ggml_hexagon_supported_ssm_conv(const struct ggml_hexagon_session * return true; } +static bool ggml_hexagon_supported_pad(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) { + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * dst = op; + + if (src0->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) { + return false; + } + + GGML_UNUSED(sess); + return true; +} + static bool ggml_hexagon_supported_cumsum(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) { const struct ggml_tensor * src0 = op->src[0]; const struct ggml_tensor * dst = op; @@ -2857,6 +2869,8 @@ static htp_op_code op_remap_to_htp(const ggml_tensor * t) { case GGML_OP_FILL: return HTP_OP_FILL; case GGML_OP_DIAG: return HTP_OP_DIAG; case GGML_OP_SOLVE_TRI: return HTP_OP_SOLVE_TRI; + case GGML_OP_PAD: return HTP_OP_PAD; + case GGML_OP_UNARY: switch (ggml_get_unary_op(t)) { case GGML_UNARY_OP_SILU: return HTP_OP_UNARY_SILU; @@ -3416,6 +3430,10 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons supp = ggml_hexagon_supported_solve_tri(sess, op); break; + case GGML_OP_PAD: + supp = ggml_hexagon_supported_pad(sess, op); + break; + default: 
break; } diff --git a/ggml/src/ggml-hexagon/htp/CMakeLists.txt b/ggml/src/ggml-hexagon/htp/CMakeLists.txt index bcadac11f..36f923243 100644 --- a/ggml/src/ggml-hexagon/htp/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/htp/CMakeLists.txt @@ -38,6 +38,7 @@ add_library(${HTP_LIB} SHARED diag-ops.c solve-tri-ops.c gated-delta-net-ops.c + pad-ops.c ) target_compile_definitions(${HTP_LIB} PRIVATE diff --git a/ggml/src/ggml-hexagon/htp/htp-ctx.h b/ggml/src/ggml-hexagon/htp/htp-ctx.h index 92f02eac6..e500ce462 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ctx.h +++ b/ggml/src/ggml-hexagon/htp/htp-ctx.h @@ -107,5 +107,6 @@ int op_fill(struct htp_ops_context * octx); int op_diag(struct htp_ops_context * octx); int op_solve_tri(struct htp_ops_context * octx); int op_gated_delta_net(struct htp_ops_context * octx); +int op_pad(struct htp_ops_context * octx); #endif /* HTP_CTX_H */ diff --git a/ggml/src/ggml-hexagon/htp/htp-ops.h b/ggml/src/ggml-hexagon/htp/htp-ops.h index 98db864dd..985ded6f2 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ops.h +++ b/ggml/src/ggml-hexagon/htp/htp-ops.h @@ -86,6 +86,7 @@ enum htp_op_code { HTP_OP_SOLVE_TRI, HTP_OP_L2_NORM, HTP_OP_GATED_DELTA_NET, + HTP_OP_PAD, HTP_OP_INVALID }; diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c index 883a31d61..85569f072 100644 --- a/ggml/src/ggml-hexagon/htp/main.c +++ b/ggml/src/ggml-hexagon/htp/main.c @@ -595,6 +595,9 @@ static int execute_op(struct htp_ops_context * octx) { case HTP_OP_SOLVE_TRI: return op_solve_tri(octx); + case HTP_OP_PAD: + return op_pad(octx); + case HTP_OP_GATED_DELTA_NET: return op_gated_delta_net(octx); diff --git a/ggml/src/ggml-hexagon/htp/pad-ops.c b/ggml/src/ggml-hexagon/htp/pad-ops.c new file mode 100644 index 000000000..3abc3c2ea --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/pad-ops.c @@ -0,0 +1,545 @@ +#pragma clang diagnostic ignored "-Wunused-variable" +#pragma clang diagnostic ignored "-Wunused-function" +#pragma clang diagnostic ignored 
"-Wunused-but-set-variable" + +#include +#include + +#include + +#include "hex-dma.h" +#include "hvx-utils.h" + +#define GGML_COMMON_DECL_C +#include "ggml-common.h" +#include "htp-ctx.h" +#include "htp-ops.h" + +/* Circular wrap: maps any integer x into [0, n) */ +static inline uint32_t wrap_around(int32_t x, uint32_t n) { + return (uint32_t)(((x % (int32_t)n) + (int32_t)n) % (int32_t)n); +} + +/* Decompose a flat dst row index into (i1, i2, i3) */ +static inline void pad_decompose_row(uint32_t ir, uint32_t ne1, uint32_t ne2, + uint32_t *i1, uint32_t *i2, uint32_t *i3) { + *i1 = ir % ne1; + *i2 = (ir / ne1) % ne2; + *i3 = ir / (ne1 * ne2); +} + +/* Return non-zero if row (i1,i2,i3) falls in the non-padded interior */ +static inline int pad_is_interior(uint32_t i1, uint32_t i2, uint32_t i3, + int32_t lp1, int32_t rp1, uint32_t ne1, + int32_t lp2, int32_t rp2, uint32_t ne2, + int32_t lp3, int32_t rp3, uint32_t ne3) { + return ((int32_t)i1 >= lp1 && (int32_t)i1 < (int32_t)ne1 - rp1) && + ((int32_t)i2 >= lp2 && (int32_t)i2 < (int32_t)ne2 - rp2) && + ((int32_t)i3 >= lp3 && (int32_t)i3 < (int32_t)ne3 - rp3); +} + +/* Compute the DDR src row pointer for a zero-pad interior row */ +static inline const uint8_t * pad_src_row_ptr(const struct htp_tensor * src, + uint32_t i1, uint32_t i2, uint32_t i3, + int32_t lp1, int32_t lp2, int32_t lp3) { + return (const uint8_t *) src->data + + (i1 - (uint32_t)lp1) * src->nb[1] + + (i2 - (uint32_t)lp2) * src->nb[2] + + (i3 - (uint32_t)lp3) * src->nb[3]; +} + +/* Compute the DDR src row pointer for a circular row (wrap-around indexing) */ +static inline const uint8_t * pad_circ_src_row_ptr(const struct htp_tensor * src, + uint32_t i1, uint32_t i2, uint32_t i3, + int32_t lp1, int32_t lp2, int32_t lp3) { + return (const uint8_t *) src->data + + wrap_around((int32_t)i1 - lp1, src->ne[1]) * src->nb[1] + + wrap_around((int32_t)i2 - lp2, src->ne[2]) * src->nb[2] + + wrap_around((int32_t)i3 - lp3, src->ne[3]) * src->nb[3]; +} + +struct 
htp_pad_context { + struct htp_ops_context * octx; + + int32_t lp0, rp0; + int32_t lp1, rp1; + int32_t lp2, rp2; + int32_t lp3, rp3; + + uint32_t nrows_per_thread; + uint32_t total_dst_rows; + + size_t type_size; + + // Row sizes for DMA kernel (populated when VTCM is available) + size_t src_row_size; + size_t src_row_size_aligned; + size_t dst_row_size; + size_t dst_row_size_aligned; +}; + +#define htp_pad_preamble \ + const struct htp_tensor * src = octx->src[0]; \ + const struct htp_tensor * dst = octx->dst; \ + \ + const uint32_t ne00 = src->ne[0]; \ + const uint32_t nb00 = src->nb[0]; \ + \ + const uint32_t ne0 = dst->ne[0]; \ + const uint32_t ne1 = dst->ne[1]; \ + const uint32_t ne2 = dst->ne[2]; \ + const uint32_t ne3 = dst->ne[3]; \ + \ + const uint32_t nb1 = dst->nb[1]; \ + const uint32_t nb2 = dst->nb[2]; \ + const uint32_t nb3 = dst->nb[3]; \ + \ + const int32_t lp0 = pctx->lp0, rp0 = pctx->rp0; \ + const int32_t lp1 = pctx->lp1, rp1 = pctx->rp1; \ + const int32_t lp2 = pctx->lp2, rp2 = pctx->rp2; \ + const int32_t lp3 = pctx->lp3, rp3 = pctx->rp3; \ + \ + const size_t type_size = pctx->type_size; \ + \ + const uint32_t row_start = pctx->nrows_per_thread * ith; \ + const uint32_t row_end = MIN(row_start + pctx->nrows_per_thread, pctx->total_dst_rows); + + +#define htp_pad_dma_preamble \ + const size_t src_row_size = pctx->src_row_size; \ + const size_t src_row_size_aligned = pctx->src_row_size_aligned; \ + const size_t dst_row_size = pctx->dst_row_size; \ + const size_t dst_row_size_aligned = pctx->dst_row_size_aligned; \ + \ + uint8_t * src_spad_base = octx->src0_spad.data + ith * octx->src0_spad.size_per_thread; \ + uint8_t * dst_spad_base = octx->dst_spad.data + ith * octx->dst_spad.size_per_thread; \ + \ + dma_queue * dma = octx->ctx->dma[ith]; + +// --------------------------------------------------------------------------- +// HVX vectorized PAD kernel +// --------------------------------------------------------------------------- + +static void 
pad_job_per_thread_hvx(unsigned int nth, unsigned int ith, void * data) { + const struct htp_pad_context * pctx = (const struct htp_pad_context *) data; + struct htp_ops_context * octx = pctx->octx; + htp_pad_preamble; + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + for (uint32_t dst_row = row_start; dst_row < row_end; dst_row++) { + uint32_t i1, i2, i3; + pad_decompose_row(dst_row, ne1, ne2, &i1, &i2, &i3); + + uint8_t * dst_ptr = (uint8_t *) dst->data + i1 * nb1 + i2 * nb2 + i3 * nb3; + + const int interior = pad_is_interior(i1, i2, i3, + lp1, rp1, ne1, + lp2, rp2, ne2, + lp3, rp3, ne3); + + if (!interior) { + hvx_splat_f32_u(dst_ptr, 0.0f, ne0); + } else { + const uint8_t * src_ptr = pad_src_row_ptr(src, i1, i2, i3, lp1, lp2, lp3); + + if (lp0 > 0) { + hvx_splat_f32_u(dst_ptr, 0.0f, (uint32_t)lp0); + } + + uint8_t * dst_row_start = dst_ptr + (size_t)lp0 * type_size; + if (nb00 == type_size) { + hvx_copy_f32_uu(dst_row_start, src_ptr, ne00); + } else { + for (uint32_t i = 0; i < ne00; i++) { + memcpy(dst_row_start + i * type_size, + src_ptr + (size_t)i * nb00, + type_size); + } + } + + if (rp0 > 0) { + hvx_splat_f32_u(dst_ptr + ((size_t)lp0 + ne00) * type_size, 0.0f, (uint32_t)rp0); + } + } + } + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "pad-hvx %d/%d: (%ux%ux%ux%u) -> (%ux%ux%ux%u) rows %u:%u usec %u\n", + ith, nth, + src->ne[0], src->ne[1], src->ne[2], src->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + row_start, row_end, + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +// --------------------------------------------------------------------------- +// HVX + DMA PAD kernel — aligned, double-buffered +// --------------------------------------------------------------------------- + +static void pad_job_per_thread_hvx_dma(unsigned int nth, unsigned int ith, void * data) { + const struct htp_pad_context * pctx = (const struct htp_pad_context *) data; + struct htp_ops_context * octx = pctx->octx; + htp_pad_preamble; + 
htp_pad_dma_preamble; + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + // ----------------------------------------------------------------------- + // Priming phase: push 2 pairs of (dummy_dst_DMA, src_DMA) to seed the + // double-buffer pipeline before the main loop begins. + // ----------------------------------------------------------------------- + for (uint32_t ir = row_start, spad_idx = 0; ir < row_end && spad_idx < 2; ir++, spad_idx++) { + uint8_t * src_spad_cur = src_spad_base + spad_idx * src_row_size_aligned; + uint8_t * dst_spad_cur = dst_spad_base + spad_idx * dst_row_size_aligned; + + dma_queue_push_vtcm_to_ddr(dma, + dma_make_ptr((uint8_t *)dst->data, dst_spad_cur), + dst_row_size, dst_row_size_aligned, 0); + + uint32_t i1, i2, i3; + pad_decompose_row(ir, ne1, ne2, &i1, &i2, &i3); + const int interior = pad_is_interior(i1, i2, i3, + lp1, rp1, ne1, + lp2, rp2, ne2, + lp3, rp3, ne3); + + const uint8_t * src_ptr = interior + ? pad_src_row_ptr(src, i1, i2, i3, lp1, lp2, lp3) : NULL; + + // Interior row: real DMA (1 row) from DDR to VTCM. + // Border row: null DMA (nrows=0) + dma_queue_push_ddr_to_vtcm(dma, + dma_make_ptr(src_spad_cur, + src_ptr ? src_ptr : (const uint8_t *)src_spad_cur), + src_row_size_aligned, src_row_size, src_ptr ? 1 : 0); + } + + // ----------------------------------------------------------------------- + // Main loop: pop completed DMAs, compute in VTCM with aligned HVX ops, + // push dst DMA and prefetch src for the next+1 row. 
+ // ----------------------------------------------------------------------- + for (uint32_t ir = row_start; ir < row_end; ir++) { + uint8_t * dst_spad_cur = (uint8_t *) dma_queue_pop(dma).src; + uint8_t * src_spad_cur = (uint8_t *) dma_queue_pop(dma).dst; + + uint32_t i1, i2, i3; + pad_decompose_row(ir, ne1, ne2, &i1, &i2, &i3); + + uint8_t * dst_ptr = (uint8_t *) dst->data + i1 * nb1 + i2 * nb2 + i3 * nb3; + + const int interior = pad_is_interior(i1, i2, i3, + lp1, rp1, ne1, + lp2, rp2, ne2, + lp3, rp3, ne3); + + if (!interior) { + hvx_splat_f32_a(dst_spad_cur, 0.0f, ne0); + } else { + hvx_splat_f32_a(dst_spad_cur, 0.0f, ne0); + + uint8_t * dst_interior = dst_spad_cur + (size_t)lp0 * type_size; + + if ((uintptr_t)dst_interior % VLEN == 0) { + hvx_copy_f32_aa(dst_interior, src_spad_cur, ne00); + } else { + hvx_copy_f32_ua(dst_interior, src_spad_cur, ne00); + } + } + + dma_queue_push_vtcm_to_ddr(dma, + dma_make_ptr(dst_ptr, dst_spad_cur), + dst_row_size, dst_row_size_aligned, 1); + + const uint32_t next_row = ir + 2; + if (next_row < row_end) { + uint32_t ni1, ni2, ni3; + pad_decompose_row(next_row, ne1, ne2, &ni1, &ni2, &ni3); + const int next_interior = pad_is_interior(ni1, ni2, ni3, + lp1, rp1, ne1, + lp2, rp2, ne2, + lp3, rp3, ne3); + const uint8_t * next_src_ptr = next_interior + ? pad_src_row_ptr(src, ni1, ni2, ni3, lp1, lp2, lp3) : NULL; + + dma_queue_push_ddr_to_vtcm(dma, + dma_make_ptr(src_spad_cur, + next_src_ptr ? next_src_ptr : (const uint8_t *)src_spad_cur), + src_row_size_aligned, src_row_size, next_src_ptr ? 
1 : 0); + } + } + + dma_queue_flush(dma); + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "pad-hvx-dma %d/%d: (%ux%ux%ux%u) -> (%ux%ux%ux%u) rows %u:%u usec %u\n", + ith, nth, + src->ne[0], src->ne[1], src->ne[2], src->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + row_start, row_end, + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +// --------------------------------------------------------------------------- +// HVX circular PAD kernel +// --------------------------------------------------------------------------- + +static void pad_job_per_thread_hvx_circular(unsigned int nth, unsigned int ith, void * data) { + const struct htp_pad_context * pctx = (const struct htp_pad_context *) data; + struct htp_ops_context * octx = pctx->octx; + htp_pad_preamble; + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + for (uint32_t dst_row = row_start; dst_row < row_end; dst_row++) { + uint32_t i1, i2, i3; + pad_decompose_row(dst_row, ne1, ne2, &i1, &i2, &i3); + + uint8_t * dst_ptr = (uint8_t *) dst->data + i1 * nb1 + i2 * nb2 + i3 * nb3; + const uint8_t * src_row = pad_circ_src_row_ptr(src, i1, i2, i3, lp1, lp2, lp3); + + if (nb00 == type_size) { + + if (lp0 > 0) { + if ((uint32_t)lp0 < 32) { + memcpy(dst_ptr, + src_row + (size_t)(ne00 - (uint32_t)lp0) * type_size, + (size_t)lp0 * type_size); + } else { + hvx_copy_f32_uu(dst_ptr, + src_row + (size_t)(ne00 - (uint32_t)lp0) * type_size, + (uint32_t)lp0); + } + } + hvx_copy_f32_uu(dst_ptr + (size_t)lp0 * type_size, src_row, ne00); + if (rp0 > 0) { + if ((uint32_t)rp0 < 32) { + memcpy(dst_ptr + ((size_t)lp0 + ne00) * type_size, + src_row, + (size_t)rp0 * type_size); + } else { + hvx_copy_f32_uu(dst_ptr + ((size_t)lp0 + ne00) * type_size, + src_row, + (uint32_t)rp0); + } + } + } else { + for (uint32_t i = 0; i < (uint32_t)lp0; i++) { + *(float *)(dst_ptr + i * type_size) = + *(const float *)(src_row + (size_t)(ne00 - (uint32_t)lp0 + i) * nb00); + } + for (uint32_t i = 0; i < ne00; i++) { + *(float 
*)(dst_ptr + ((size_t)lp0 + i) * type_size) = + *(const float *)(src_row + (size_t)i * nb00); + } + for (uint32_t i = 0; i < (uint32_t)rp0; i++) { + *(float *)(dst_ptr + ((size_t)lp0 + ne00 + i) * type_size) = + *(const float *)(src_row + (size_t)i * nb00); + } + } + } + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "pad-hvx-circ %d/%d: (%ux%ux%ux%u) -> (%ux%ux%ux%u) rows %u:%u usec %u\n", + ith, nth, + src->ne[0], src->ne[1], src->ne[2], src->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + row_start, row_end, + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +// --------------------------------------------------------------------------- +// HVX + DMA circular PAD kernel — aligned, double-buffered +// --------------------------------------------------------------------------- + +static void pad_job_per_thread_hvx_circular_dma(unsigned int nth, unsigned int ith, void * data) { + const struct htp_pad_context * pctx = (const struct htp_pad_context *) data; + struct htp_ops_context * octx = pctx->octx; + htp_pad_preamble; + htp_pad_dma_preamble; + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + // ----------------------------------------------------------------------- + // Priming phase: push 2 pairs of (dummy_dst_DMA, src_DMA) to seed the + // double-buffer pipeline. Every row is a real src DMA (no null DMAs). 
+ // ----------------------------------------------------------------------- + for (uint32_t ir = row_start, spad_idx = 0; ir < row_end && spad_idx < 2; ir++, spad_idx++) { + uint8_t * src_spad_cur = src_spad_base + spad_idx * src_row_size_aligned; + uint8_t * dst_spad_cur = dst_spad_base + spad_idx * dst_row_size_aligned; + + dma_queue_push_vtcm_to_ddr(dma, + dma_make_ptr((uint8_t *)dst->data, dst_spad_cur), + dst_row_size, dst_row_size_aligned, 0); + + uint32_t pi1, pi2, pi3; + pad_decompose_row(ir, ne1, ne2, &pi1, &pi2, &pi3); + dma_queue_push_ddr_to_vtcm(dma, + dma_make_ptr(src_spad_cur, pad_circ_src_row_ptr(src, pi1, pi2, pi3, lp1, lp2, lp3)), + src_row_size_aligned, src_row_size, 1); + } + + // ----------------------------------------------------------------------- + // Main loop: pop completed DMAs, assemble circular row in VTCM with + // aligned HVX ops, push dst DMA and prefetch src for the next+1 row. + // ----------------------------------------------------------------------- + for (uint32_t ir = row_start; ir < row_end; ir++) { + uint8_t * dst_spad_cur = (uint8_t *) dma_queue_pop(dma).src; + uint8_t * src_spad_cur = (uint8_t *) dma_queue_pop(dma).dst; + + uint32_t i1, i2, i3; + pad_decompose_row(ir, ne1, ne2, &i1, &i2, &i3); + uint8_t * dst_ptr = (uint8_t *) dst->data + i1 * nb1 + i2 * nb2 + i3 * nb3; + + + if (lp0 > 0) { + uint8_t * dst_left = dst_spad_cur; + const uint8_t * src_left = src_spad_cur + (size_t)(ne00 - (uint32_t)lp0) * type_size; + if ((uint32_t)lp0 < 32) { + memcpy(dst_left, src_left, (size_t)lp0 * type_size); + } else { + hvx_copy_f32_uu(dst_left, src_left, (uint32_t)lp0); + } + } + + { + uint8_t * dst_mid = dst_spad_cur + (size_t)lp0 * type_size; + if ((uintptr_t)dst_mid % VLEN == 0) { + hvx_copy_f32_aa(dst_mid, src_spad_cur, ne00); + } else { + hvx_copy_f32_ua(dst_mid, src_spad_cur, ne00); + } + } + + if (rp0 > 0) { + uint8_t * dst_right = dst_spad_cur + ((size_t)lp0 + ne00) * type_size; + if ((uint32_t)rp0 < 32) { + memcpy(dst_right, 
src_spad_cur, (size_t)rp0 * type_size); + } else { + if ((uintptr_t)dst_right % VLEN == 0) { + hvx_copy_f32_aa(dst_right, src_spad_cur, (uint32_t)rp0); + } else { + hvx_copy_f32_ua(dst_right, src_spad_cur, (uint32_t)rp0); + } + } + } + + dma_queue_push_vtcm_to_ddr(dma, + dma_make_ptr(dst_ptr, dst_spad_cur), + dst_row_size, dst_row_size_aligned, 1); + + const uint32_t next_row = ir + 2; + if (next_row < row_end) { + uint32_t nri1, nri2, nri3; + pad_decompose_row(next_row, ne1, ne2, &nri1, &nri2, &nri3); + dma_queue_push_ddr_to_vtcm(dma, + dma_make_ptr(src_spad_cur, + pad_circ_src_row_ptr(src, nri1, nri2, nri3, lp1, lp2, lp3)), + src_row_size_aligned, src_row_size, 1); + } + } + + dma_queue_flush(dma); + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "pad-hvx-circ-dma %d/%d: (%ux%ux%ux%u) -> (%ux%ux%ux%u) rows %u:%u usec %u\n", + ith, nth, + src->ne[0], src->ne[1], src->ne[2], src->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + row_start, row_end, + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +int op_pad(struct htp_ops_context * octx) { + const struct htp_tensor * src0 = octx->src[0]; + const struct htp_tensor * dst = octx->dst; + + // Only F32 supported + size_t type_size; + switch (src0->type) { + case HTP_TYPE_F32: type_size = 4; break; + default: + FARF(ERROR, "pad-hvx: unsupported type %u\n", src0->type); + return HTP_STATUS_NO_SUPPORT; + } + + if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) { + return HTP_STATUS_OK; + } + + const int32_t lp0 = octx->op_params[0]; + const int32_t rp0 = octx->op_params[1]; + const int32_t lp1 = octx->op_params[2]; + const int32_t rp1 = octx->op_params[3]; + const int32_t lp2 = octx->op_params[4]; + const int32_t rp2 = octx->op_params[5]; + const int32_t lp3 = octx->op_params[6]; + const int32_t rp3 = octx->op_params[7]; + const int32_t circular = octx->op_params[8]; + + const uint32_t ne0 = dst->ne[0]; + const uint32_t ne00 = src0->ne[0]; + + const uint32_t total_dst_rows = dst->ne[1] * dst->ne[2] * 
dst->ne[3]; + const uint32_t n_threads = MIN(octx->n_threads, total_dst_rows > 0 ? total_dst_rows : 1); + + const size_t src_row_size = (size_t)ne00 * type_size; + const size_t dst_row_size = (size_t)ne0 * type_size; + const size_t src_row_size_aligned = hex_round_up(src_row_size, VLEN); + const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN); + + // Total VTCM needed: 2 buffers (ping+pong) for src and dst, per thread + const size_t vtcm_needed = (size_t)n_threads * 2 * (src_row_size_aligned + dst_row_size_aligned); + + const int use_dma = (src0->nb[0] == (uint32_t)type_size) && + (ne00 >= 512) && + (octx->ctx->vtcm_base != NULL) && + (octx->ctx->vtcm_size >= vtcm_needed); + + if (use_dma) { + octx->src0_spad.size_per_thread = 2 * src_row_size_aligned; + octx->dst_spad.size_per_thread = 2 * dst_row_size_aligned; + octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread; + octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread; + octx->src0_spad.data = octx->ctx->vtcm_base; + octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size; + } + + struct htp_pad_context pctx = { + .octx = octx, + .lp0 = lp0, .rp0 = rp0, + .lp1 = lp1, .rp1 = rp1, + .lp2 = lp2, .rp2 = rp2, + .lp3 = lp3, .rp3 = rp3, + .nrows_per_thread = (total_dst_rows + n_threads - 1) / n_threads, + .total_dst_rows = total_dst_rows, + .type_size = type_size, + .src_row_size = src_row_size, + .src_row_size_aligned = src_row_size_aligned, + .dst_row_size = dst_row_size, + .dst_row_size_aligned = dst_row_size_aligned, + }; + + FARF(HIGH, "pad-hvx%s%s: (%ux%ux%ux%u) -> (%ux%ux%ux%u) pads=(%d,%d,%d,%d,%d,%d,%d,%d)\n", + circular ? "-circ" : "", + use_dma ? 
"-dma" : "", + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); + + if (circular && use_dma) { worker_pool_run_func(octx->ctx->worker_pool, pad_job_per_thread_hvx_circular_dma, &pctx, n_threads); } + else if (circular) { worker_pool_run_func(octx->ctx->worker_pool, pad_job_per_thread_hvx_circular, &pctx, n_threads); } + else if (use_dma) { worker_pool_run_func(octx->ctx->worker_pool, pad_job_per_thread_hvx_dma, &pctx, n_threads); } + else { worker_pool_run_func(octx->ctx->worker_pool, pad_job_per_thread_hvx, &pctx, n_threads); } + + return HTP_STATUS_OK; +} + From 9a532ae4bab1b164052ce60a738f78538b421c66 Mon Sep 17 00:00:00 2001 From: Pranav Dhinakar Date: Mon, 18 May 2026 14:04:57 -0700 Subject: [PATCH 07/33] hexagon: add support for TRI op (#22822) * Hexagon: TRI HVX Kernel addition to ggml hexagon HTP ops and context * addressed PR review comments for TRI op * hexagon: clang format * hex-unary: remove merge conflict markers * hex-ggml: remove duplicate op cases (merge conflict) * hex-ggml: fix editor config errors --------- Co-authored-by: Todor Boinovski Co-authored-by: Max Krasnyansky --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 20 +++++ ggml/src/ggml-hexagon/htp/htp-ctx.h | 1 + ggml/src/ggml-hexagon/htp/htp-ops.h | 1 + ggml/src/ggml-hexagon/htp/main.c | 3 + ggml/src/ggml-hexagon/htp/unary-ops.c | 113 ++++++++++++++++++++++++- 5 files changed, 137 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index c24a2305e..2f75e97ac 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -2828,6 +2828,21 @@ static bool ggml_hexagon_supported_solve_tri(const struct ggml_hexagon_session * return true; } +static bool ggml_hexagon_supported_tri(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) { + + const struct ggml_tensor * src0 = 
op->src[0]; + const struct ggml_tensor * dst = op; + + if (src0->type != GGML_TYPE_F32) { return false; } + if (dst->type != GGML_TYPE_F32) { return false; } + if (!ggml_are_same_shape(src0, dst)) { return false; } + if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) { return false; } + + return true; + + GGML_UNUSED(sess); +} + static const char * ggml_backend_hexagon_name(ggml_backend_t backend) { auto sess = static_cast(backend->context); return sess->c_name(); @@ -2869,6 +2884,7 @@ static htp_op_code op_remap_to_htp(const ggml_tensor * t) { case GGML_OP_FILL: return HTP_OP_FILL; case GGML_OP_DIAG: return HTP_OP_DIAG; case GGML_OP_SOLVE_TRI: return HTP_OP_SOLVE_TRI; + case GGML_OP_TRI: return HTP_OP_TRI; case GGML_OP_PAD: return HTP_OP_PAD; case GGML_OP_UNARY: @@ -3430,6 +3446,10 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons supp = ggml_hexagon_supported_solve_tri(sess, op); break; + case GGML_OP_TRI: + supp = ggml_hexagon_supported_tri(sess, op); + break; + case GGML_OP_PAD: supp = ggml_hexagon_supported_pad(sess, op); break; diff --git a/ggml/src/ggml-hexagon/htp/htp-ctx.h b/ggml/src/ggml-hexagon/htp/htp-ctx.h index e500ce462..6fe3e6c7d 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ctx.h +++ b/ggml/src/ggml-hexagon/htp/htp-ctx.h @@ -107,6 +107,7 @@ int op_fill(struct htp_ops_context * octx); int op_diag(struct htp_ops_context * octx); int op_solve_tri(struct htp_ops_context * octx); int op_gated_delta_net(struct htp_ops_context * octx); +int op_tri(struct htp_ops_context * octx); int op_pad(struct htp_ops_context * octx); #endif /* HTP_CTX_H */ diff --git a/ggml/src/ggml-hexagon/htp/htp-ops.h b/ggml/src/ggml-hexagon/htp/htp-ops.h index 985ded6f2..676e948a4 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ops.h +++ b/ggml/src/ggml-hexagon/htp/htp-ops.h @@ -86,6 +86,7 @@ enum htp_op_code { HTP_OP_SOLVE_TRI, HTP_OP_L2_NORM, HTP_OP_GATED_DELTA_NET, + HTP_OP_TRI, HTP_OP_PAD, HTP_OP_INVALID diff --git 
a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c index 85569f072..12003c1fd 100644 --- a/ggml/src/ggml-hexagon/htp/main.c +++ b/ggml/src/ggml-hexagon/htp/main.c @@ -601,6 +601,9 @@ static int execute_op(struct htp_ops_context * octx) { case HTP_OP_GATED_DELTA_NET: return op_gated_delta_net(octx); + case HTP_OP_TRI: + return op_tri(octx); + case HTP_OP_INVALID: break; diff --git a/ggml/src/ggml-hexagon/htp/unary-ops.c b/ggml/src/ggml-hexagon/htp/unary-ops.c index d4ae89ee6..1ce881353 100644 --- a/ggml/src/ggml-hexagon/htp/unary-ops.c +++ b/ggml/src/ggml-hexagon/htp/unary-ops.c @@ -17,7 +17,6 @@ #include "ggml-common.h" #include "htp-ctx.h" #include "htp-ops.h" -#include "htp-ops.h" struct htp_unary_context { struct htp_ops_context * octx; @@ -277,6 +276,95 @@ static void sigmoid_f32(const float * restrict src, } } +static void tri_f32(const float * restrict src, + float * restrict dst, + uint8_t * restrict spad, + const uint32_t num_rows, + const uint32_t row_elems, + const size_t row_size, + int32_t * op_params, + const uint32_t ir, + const struct htp_unary_context * uctx) { + + const int32_t ttype = op_params[0]; + const HVX_Vector zero = hvx_vec_splat_f32(0.0f); + const uint32_t nvec = row_elems / VLEN_FP32; + const uint32_t nloe = row_elems % VLEN_FP32; + + const uint32_t ne01 = uctx->octx->src[0]->ne[1]; + + for (uint32_t b = 0; b < num_rows; b++) { + const uint32_t abs_row = ir + b; + const uint32_t i01 = abs_row % ne01; + + const HVX_Vector * restrict v_src = (const HVX_Vector *) ((const uint8_t *) src + b * row_size); + HVX_Vector * restrict v_dst = (HVX_Vector *) ((uint8_t *) dst + b * row_size); + + uint32_t boundary; + int keep_left; + switch (ttype) { + case 0: boundary = i01; keep_left = 0; break; // keep col >= row + case 1: boundary = i01 + 1; keep_left = 0; break; // keep col > row + case 2: boundary = i01 + 1; keep_left = 1; break; // keep col <= row + case 3: boundary = i01; keep_left = 1; break; // keep col < row + default: 
boundary = 0; keep_left = 0; break; + } + if (boundary > row_elems) boundary = row_elems; + + // Full HVX vectors — each starts at a 128-byte aligned offset + for (uint32_t i = 0; i < nvec; i++) { + const uint32_t vec_start = i * VLEN_FP32; + const uint32_t vec_end = vec_start + VLEN_FP32; + if (keep_left) { + if (vec_end <= boundary) { + v_dst[i] = v_src[i]; + } else if (vec_start >= boundary) { + v_dst[i] = zero; + } else { + HVX_VectorPred mask = Q6_Q_vsetq_R((boundary - vec_start) * sizeof(float)); + v_dst[i] = Q6_V_vmux_QVV(mask, v_src[i], zero); + } + } else { + if (vec_end <= boundary) { + v_dst[i] = zero; + } else if (vec_start >= boundary) { + v_dst[i] = v_src[i]; + } else { + HVX_VectorPred mask = Q6_Q_vsetq_R((boundary - vec_start) * sizeof(float)); + v_dst[i] = Q6_V_vmux_QVV(mask, zero, v_src[i]); + } + } + } + + // Tail elements (row_elems not a multiple of VLEN_FP32) + if (nloe > 0) { + const uint32_t vec_start = nvec * VLEN_FP32; + const uint32_t vec_end = vec_start + nloe; + HVX_Vector tail_val; + if (keep_left) { + if (vec_end <= boundary) { + tail_val = v_src[nvec]; + } else if (vec_start >= boundary) { + tail_val = zero; + } else { + HVX_VectorPred mask = Q6_Q_vsetq_R((boundary - vec_start) * sizeof(float)); + tail_val = Q6_V_vmux_QVV(mask, v_src[nvec], zero); + } + } else { + if (vec_end <= boundary) { + tail_val = zero; + } else if (vec_start >= boundary) { + tail_val = v_src[nvec]; + } else { + HVX_VectorPred mask = Q6_Q_vsetq_R((boundary - vec_start) * sizeof(float)); + tail_val = Q6_V_vmux_QVV(mask, zero, v_src[nvec]); + } + } + hvx_vec_store_a(&v_dst[nvec], nloe * sizeof(float), tail_val); + } + } +} + static void softplus_f32(const float * restrict src, float * restrict dst, uint8_t * restrict spad, @@ -498,6 +586,9 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * case HTP_OP_L2_NORM: l2_norm_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params); break; + case HTP_OP_TRI: + 
tri_f32(src0_spad, dst_spad, NULL, block_size, ne00, src0_row_size_aligned, op_params, ir, uctx); + break; default: break; } @@ -571,6 +662,10 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) { case HTP_OP_L2_NORM: op_type = "l2norm-f32"; break; + case HTP_OP_TRI: + op_type = "tri-f32"; + break; + default: FARF(ERROR, "Unsupported unary Op %u\n", octx->op); return HTP_STATUS_NO_SUPPORT; @@ -640,6 +735,22 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) { return err; } +int op_tri(struct htp_ops_context * octx) { + int err = HTP_STATUS_OK; + + switch (octx->src[0]->type) { + case HTP_TYPE_F32: + err = execute_op_unary_f32(octx); + break; + + default: + err = HTP_STATUS_NO_SUPPORT; + break; + } + + return err; +} + int op_unary(struct htp_ops_context * octx) { int err = HTP_STATUS_OK; From c3e9ade6dd3ff2a1ceafd2d59062634715b472c4 Mon Sep 17 00:00:00 2001 From: Radoslav Gerganov Date: Tue, 19 May 2026 09:42:36 +0300 Subject: [PATCH 08/33] rpc : keep last_graph_uid in the device context (#23273) With the introduction of MTP we can have multiple compute contexts for the same RPC device. In this case last_graph_uid is not updated properly when contexts are being switched. This patch fixes this by moving last_graph_uid to the device context, making sure it is always updated. 
closes: #23242 --- ggml/src/ggml-rpc/ggml-rpc.cpp | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index 1cb8f563d..d38057721 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -199,6 +199,14 @@ static ggml_guid_t ggml_backend_rpc_guid() { return &guid; } +struct ggml_backend_rpc_device_context { + std::string endpoint; + uint32_t device; + std::string name; + std::string description; + uint64_t last_graph_uid; +}; + struct ggml_backend_rpc_buffer_type_context { std::string endpoint; uint32_t device; @@ -211,7 +219,6 @@ struct ggml_backend_rpc_context { std::string endpoint; uint32_t device; std::string name; - uint64_t last_graph_uid; }; struct ggml_backend_rpc_buffer_context { @@ -691,9 +698,11 @@ static void serialize_graph(uint32_t device, const ggml_cgraph * cgraph, std::ve static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context; + ggml_backend_dev_t rpc_dev = ggml_backend_get_device(backend); + ggml_backend_rpc_device_context * rpc_dev_ctx = (ggml_backend_rpc_device_context *)rpc_dev->context; GGML_ASSERT(cgraph->n_nodes > 0); - bool reuse = cgraph->uid != 0 && rpc_ctx->last_graph_uid == cgraph->uid; + bool reuse = cgraph->uid != 0 && rpc_dev_ctx->last_graph_uid == cgraph->uid; if (reuse) { rpc_msg_graph_recompute_req request; request.device = rpc_ctx->device; @@ -701,7 +710,7 @@ static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, g bool status = send_rpc_cmd(sock, RPC_CMD_GRAPH_RECOMPUTE, &request, sizeof(request)); RPC_STATUS_ASSERT(status); } else { - rpc_ctx->last_graph_uid = cgraph->uid; + rpc_dev_ctx->last_graph_uid = cgraph->uid; std::vector input; serialize_graph(rpc_ctx->device, cgraph, input); auto sock = get_socket(rpc_ctx->endpoint); @@ -770,7 
+779,6 @@ ggml_backend_t ggml_backend_rpc_init(const char * endpoint, uint32_t device) { /* .endpoint = */ endpoint, /* .device = */ device, /* .name = */ dev_name, - /* .last_graph_uid = */ 0, }; auto reg = ggml_backend_rpc_add_server(endpoint); ggml_backend_t backend = new ggml_backend { @@ -1757,15 +1765,6 @@ void ggml_backend_rpc_start_server(const char * endpoint, const char * cache_dir } } -// device interface - -struct ggml_backend_rpc_device_context { - std::string endpoint; - uint32_t device; - std::string name; - std::string description; -}; - static const char * ggml_backend_rpc_device_get_name(ggml_backend_dev_t dev) { ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context; @@ -1947,10 +1946,11 @@ ggml_backend_reg_t ggml_backend_rpc_add_server(const char * endpoint) { std::string dev_name = "RPC" + std::to_string(dev_id); std::string dev_desc = std::string(endpoint); ggml_backend_rpc_device_context * dev_ctx = new ggml_backend_rpc_device_context { - /* .endpoint = */ endpoint, - /* .device = */ ind, - /* .name = */ dev_name, - /* .description = */ dev_desc + /* .endpoint = */ endpoint, + /* .device = */ ind, + /* .name = */ dev_name, + /* .description = */ dev_desc, + /* .last_graph_uid = */ 0, }; ggml_backend_dev_t dev = new ggml_backend_device { From 439f1b193d2d7d8db4d2b70cbf63e3afcbb38df8 Mon Sep 17 00:00:00 2001 From: Intel AI Get-to Market Customer Success and Solutions Date: Mon, 18 May 2026 23:44:02 -0700 Subject: [PATCH 09/33] sycl: add GGML_SYCL_USE_ASYNC_MEM_OP env toggle (#22153) * sycl: add GGML_SYCL_USE_ASYNC_MEM_OP env toggle Signed-off-by: Chun Tao * Use async mem ops for correctness when SYCL graphs are explicitly on. 
Signed-off-by: Tao, Chun --------- Signed-off-by: Chun Tao Signed-off-by: Tao, Chun Co-authored-by: Chun Tao --- ggml/src/ggml-sycl/ggml-sycl.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index ebe7c5b35..2ea47f715 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -72,6 +72,7 @@ int g_ggml_sycl_disable_graph = 0; int g_ggml_sycl_disable_dnn = 0; int g_ggml_sycl_prioritize_dmmv = 0; int g_ggml_sycl_use_async_mem_op = 0; +int g_ggml_sycl_use_async_mem_op_requested = 1; int g_ggml_sycl_enable_level_zero = 0; int g_ggml_sycl_enable_flash_attention = 1; @@ -304,6 +305,8 @@ static void ggml_check_sycl() try { GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: DNN disabled by compile flag\n"); #endif GGML_LOG_INFO(" GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv); + g_ggml_sycl_use_async_mem_op_requested = get_sycl_env("GGML_SYCL_USE_ASYNC_MEM_OP", 1); + GGML_LOG_INFO(" GGML_SYCL_USE_ASYNC_MEM_OP: %d\n", g_ggml_sycl_use_async_mem_op_requested); #ifdef SYCL_FLASH_ATTN GGML_LOG_INFO(" GGML_SYCL_ENABLE_FLASH_ATTN: %d\n", g_ggml_sycl_enable_flash_attention); @@ -319,11 +322,11 @@ static void ggml_check_sycl() try { fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__); #endif */ - // Currently, we only use async malloc / free when graphs are enabled as it is required for the calls to be - // properly recorded. As this SYCL extension matures it may be beneficial to enable as the default path and in - // other places. + // Async USM allocation/free is also useful outside the graph path: it avoids the host waits in the reorder + // staging path while preserving queue ordering semantics. Graph support still depends on the extension being + // available, but it no longer needs to control the non-graph fast path. 
#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC - g_ggml_sycl_use_async_mem_op = !g_ggml_sycl_disable_graph; + g_ggml_sycl_use_async_mem_op = g_ggml_sycl_use_async_mem_op_requested || !g_ggml_sycl_disable_graph; if (g_ggml_sycl_use_async_mem_op) { for (unsigned int i = 0; i < dpct::dev_mgr::instance().device_count(); ++i) { if (!dpct::dev_mgr::instance().get_device(i).has(sycl::aspect::ext_oneapi_async_memory_alloc)) { From f1c1c5c057f047562b637db0ac7eac11485307bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Tue, 19 May 2026 08:44:25 +0200 Subject: [PATCH 10/33] convert : filter lora tensor names (#23077) --- convert_lora_to_gguf.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 1b7334617..81658ba03 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -445,6 +445,11 @@ if __name__ == '__main__': if self.lazy: tensor = LazyTorchTensor.from_eager(tensor) base_name = get_base_tensor_name(name) + # filter base name, ignore tensor transformations for now + data_gen = lambda g=tensor: g # noqa: E731 + if (titem := self.filter_tensors((base_name, data_gen))) is None: + continue + base_name, _ = titem # note: mergekit-extract-lora also adds token embeddings to the adapter is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name From aabee047d8ebf7abe2750585a347aa19feced3b5 Mon Sep 17 00:00:00 2001 From: Neo Zhang Date: Tue, 19 May 2026 14:44:51 +0800 Subject: [PATCH 11/33] [SCYL] add chapter for performance reference in SYCL.md (#23315) * add chapter for performance reference * rm unsupported GPU --- README.md | 2 +- docs/backend/SYCL.md | 13 ++++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index a0c14b9d7..71327e514 100644 --- a/README.md +++ b/README.md @@ -280,7 +280,7 @@ Instructions for adding support for new 
models: [HOWTO-add-model.md](docs/develo | [Metal](docs/build.md#metal-build) | Apple Silicon | | [BLAS](docs/build.md#blas-build) | All | | [BLIS](docs/backend/BLIS.md) | All | -| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU | +| [SYCL](docs/backend/SYCL.md) | Intel GPU | | [OpenVINO [In Progress]](docs/backend/OPENVINO.md) | Intel CPUs, GPUs, and NPUs | | [MUSA](docs/build.md#musa) | Moore Threads GPU | | [CUDA](docs/build.md#cuda) | Nvidia GPU | diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 155f933b8..0c4660b54 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -5,6 +5,7 @@ - [News](#news) - [OS](#os) - [Hardware](#hardware) +- [Performance Reference](#performance-reference) - [Docker](#docker) - [Linux](#linux) - [Windows](#windows) @@ -51,9 +52,8 @@ The packages for FP32 and FP16 would have different accuracy and performance on ## News -- 2026.04 - - - Optimize mul_mat by reorder feature for data type: Q4_K, Q5_K, Q_K, Q8_0. +- 2026.04-05 + - Optimize mul_mat by reorder feature for data type: Q4_K, Q5_K, Q6_K, Q8_0. - Fused MoE. - Upgrate CI and built package for oneAPI 2025.3.3, support Ubuntu 24.04 built package. @@ -150,6 +150,13 @@ On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md) although the NA +## Performance Reference + + +To get the supported LLMs, GPUs, and performance reference, please check [Performance of llama.cpp on Intel GPU with SYCL backend](https://github.com/ggml-org/llama.cpp/discussions/23313). + +You could update your test result in it directly. + ## Docker The docker build option is currently limited to *Intel GPU* targets. 
From c85a242ed021ab6732e2973764437c3c5655102b Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Mon, 18 May 2026 23:45:41 -0700 Subject: [PATCH 12/33] ggml-webgpu : extend GDN for K>1 (#23299) --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 2 ++ .../wgsl-shaders/gated_delta_net.wgsl | 24 +++++++++++++++---- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 78cb02be0..921c12b41 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -1234,6 +1234,7 @@ static webgpu_encoded_op ggml_webgpu_gated_delta_net(webgpu_context & ctx, const uint32_t h = (uint32_t) src2->ne[1]; const uint32_t n_tokens = (uint32_t) src2->ne[2]; const uint32_t n_seqs = (uint32_t) src2->ne[3]; + const uint32_t K = (uint32_t) src5->ne[1]; const float scale = 1.0f / sqrtf((float) s_v); uint32_t scale_u32; memcpy(&scale_u32, &scale, sizeof(scale_u32)); @@ -1258,6 +1259,7 @@ static webgpu_encoded_op ggml_webgpu_gated_delta_net(webgpu_context & ctx, (uint32_t) src0->ne[1], (uint32_t) (src2->ne[3] / src0->ne[3]), + K, scale_u32, }; diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl index f9d98fda4..d68520f82 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/gated_delta_net.wgsl @@ -39,6 +39,7 @@ struct Params { neq1: u32, rq3: u32, + K: u32, scale: f32, }; @@ -62,11 +63,14 @@ fn main( let iq3 = seq_id / params.rq3; let state_size = S_V * S_V; - let state_base = (seq_id * params.h + head_id) * state_size; + let state_in_base = (seq_id * params.K * params.h + head_id) * state_size; + let state_out_base = (seq_id * params.h + head_id) * state_size; + let state_size_per_snap = state_size * params.h * params.n_seqs; + let shift = i32(params.n_tokens) - i32(params.K); var state: array; for (var i = 0u; i < S_V; i++) { - state[i] = src_state[state_base 
+ col * S_V + i]; + state[i] = src_state[state_in_base + col * S_V + i]; } var attn_off = (seq_id * params.n_tokens * params.h + head_id) * S_V; @@ -123,10 +127,22 @@ fn main( dst[attn_off + col] = attn_col * params.scale; attn_off += S_V * params.h; + if (params.K > 1u) { + let target_slot = i32(t) - shift; + if (target_slot >= 0 && target_slot < i32(params.K)) { + let slot_base = params.s_off + u32(target_slot) * state_size_per_snap + state_out_base; + for (var i = 0u; i < S_V; i++) { + dst[slot_base + col * S_V + i] = state[i]; + } + } + } + workgroupBarrier(); } - for (var i = 0u; i < S_V; i++) { - dst[params.s_off + state_base + col * S_V + i] = state[i]; + if (params.K == 1u) { + for (var i = 0u; i < S_V; i++) { + dst[params.s_off + state_out_base + col * S_V + i] = state[i]; + } } } From d2e179a477fc1d1935b68422c1181ef2d62ed2ef Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 May 2026 09:46:05 +0300 Subject: [PATCH 13/33] llama-eval : add per-task summary stats (#23151) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * llama-eval : add per-problem summary table to HTML reports - Add chunk_idx and problem_idx to TaskState and saved case dicts - Group completed cases by problem_idx in dump_html() - Render per-problem summary table before individual task table - Columns: Problem (zero-padded), Runs, Correct (n/r), Tokens (min/avg/max), T/s (min/avg/max), Gen s (min/avg/max) - Sorted by problem index, monospace font, right-aligned numbers - Colspan headers for grouped stats, auto width - Simulator: add /v1/models endpoint, timings in response, template-aware question matching, --dataset arg (aime/aime2025) Assisted-by: llama.cpp:local pi * llama-eval : add tabs for Detailed and Summary tables, apply monospace font globally - Wrap Detailed and Summary tables in switchable tabs (Detailed active by default) - Remove summary-section wrapper, use tab labels instead - Apply monospace font to all tables and the top 
bar Assisted-by: llama.cpp:local pi * llama-eval : redesign top bar as CSS grid label/value pairs - Replace flat span list with 4-column grid layout (2 pairs per row) - Labels in muted color (#888), values in dark (#222) - Bold dataset name and model name - Removed media query, always uses 4 columns Assisted-by: llama.cpp:local pi * llama-eval : use realistic token counts and throughput in simulator - comp_tokens: [30, 80] → [10000, 60000] - tps_gen: derived → uniform [90.0, 110.0] - t_gen_ms: now computed from tokens/tps Assisted-by: llama.cpp:local pi * llama-eval : color Answer column green/red based on correctness Use the same .correct/.incorrect CSS classes on the Answer column to make correct answers green and incorrect answers red. Assisted-by: llama.cpp:local pi * llama-eval : fix pyright errors from max(..., key=len) type inference Use key=lambda x: len(x) instead of key=len so the type checker infers the return type as str instead of Sized, fixing: - unresolved-attribute: Object of type Sized has no attribute lower - not-subscriptable: Cannot subscript object of type Sized Assisted-by: llama.cpp:local pi --- examples/llama-eval/llama-eval.py | 189 ++++++++++++++---- examples/llama-eval/llama-server-simulator.py | 99 +++++++-- 2 files changed, 233 insertions(+), 55 deletions(-) diff --git a/examples/llama-eval/llama-eval.py b/examples/llama-eval/llama-eval.py index e833070ee..4bdd239c0 100755 --- a/examples/llama-eval/llama-eval.py +++ b/examples/llama-eval/llama-eval.py @@ -149,6 +149,8 @@ class TaskState: t_gen_ms: Optional[float] = None reasoning_content: Optional[str] = None server_name: Optional[str] = None + chunk_idx: int = 0 + problem_idx: int = 0 class EvalState: @@ -233,7 +235,9 @@ class EvalState: tps_gen: Optional[float] = None, t_gen_ms: Optional[float] = None, reasoning_content: Optional[str] = None, - server_name: Optional[str] = None + server_name: Optional[str] = None, + chunk_idx: int = 0, + problem_idx: int = 0, ): with self._lock: if 
"cases" not in self.task_states: @@ -252,7 +256,9 @@ class EvalState: "tps_gen": tps_gen, "t_gen_ms": t_gen_ms, "reasoning_content": reasoning_content, - "server_name": server_name + "server_name": server_name, + "chunk_idx": chunk_idx, + "problem_idx": problem_idx, } self.correct = sum(1 for c in self.task_states.get("cases", {}).values() if c.get("correct", False)) @@ -289,6 +295,9 @@ class EvalState: all_cases = {} for i, task_id in tasks_to_save: question_text, prompt, expected = self.get_case(i) + # Extract chunk_idx from task_id for pending cases + _parts = task_id.rsplit("_", 2) + _chunk_idx = int(_parts[-2]) if len(_parts) >= 3 else 0 if task_id in self.task_states.get("cases", {}): all_cases[task_id] = self.task_states["cases"][task_id] else: @@ -306,7 +315,9 @@ class EvalState: "tps_gen": None, "t_gen_ms": None, "reasoning_content": None, - "server_name": None + "server_name": None, + "chunk_idx": _chunk_idx, + "problem_idx": i, } ci_lower, ci_upper = self.accuracy_ci() @@ -382,11 +393,12 @@ class EvalState: grader_log_str = self._escape_html(json.dumps(grader_log, indent=2)) escaped_server = self._escape_html(server_name) + answer_class = status_class if status == "ok" else "" rows.append(f""" {task_id} {status_text} {self._escape_html(expected)} - {self._escape_html(answer)} + {self._escape_html(answer)} {tokens_str} {tps_str} {t_gen_str} @@ -405,6 +417,53 @@ class EvalState: rows_html = "\n".join(rows) + # ---- per-problem summary table ---- + problem_groups: Dict[int, List[Dict[str, Any]]] = {} + for _tid, _case in cases.items(): + if _case.get("status") != "ok": + continue + _pidx = _case.get("problem_idx") + if _pidx is None: + _p_parts = _tid.rsplit("_", 2) + _pidx = int(_p_parts[-1]) if len(_p_parts) >= 3 else 0 + problem_groups.setdefault(_pidx, []).append(_case) + + summary_rows_html = "" + if problem_groups: + def _stat(v, fmt=".1f", avg_fmt=None): + if not v: + return ("–", "–", "–") + af = fmt if avg_fmt is None else avg_fmt + return 
(f"{min(v):{fmt}}", f"{sum(v)/len(v):{af}}", f"{max(v):{fmt}}") + + summary_data = [] + for pidx, g in problem_groups.items(): + runs = len(g) + n_ok = sum(1 for c in g if c.get("correct", False)) + toks = [c["tokens"] for c in g if c.get("tokens") is not None] + tps = [c["tps_gen"] for c in g if c.get("tps_gen") is not None] + tg = [c["t_gen_ms"] / 1000 for c in g if c.get("t_gen_ms") is not None] + summary_data.append(( + pidx, runs, n_ok, + _stat(toks, "d", ".0f"), + _stat(tps), + _stat(tg), + )) + + summary_data.sort(key=lambda r: r[0]) # sort by problem index ascending + + summary_rows_html = "\n".join( + f""" + {p:03d} + {r} + {n}/{r} + {tk[0]}{tk[1]}{tk[2]} + {tp[0]}{tp[1]}{tp[2]} + {tg[0]}{tg[1]}{tg[2]} + """ + for p, r, n, tk, tp, tg in summary_data + ) + html_content = f""" @@ -412,10 +471,10 @@ class EvalState: {self.dataset_type.upper()} Eval
- {self.dataset_type.upper()} - Model: {self.model_name or 'N/A'} - Accuracy: {accuracy:.1f}% [{ci_lower*100:.1f}%, {ci_upper*100:.1f}%] - Correct: {n_correct} / {len(completed)} - Pending: {n_pending} - Time: {self.total_time:.1f}s - Sampling: {sampling_str} +
Dataset
{self.dataset_type.upper()}
+
Model
{self.model_name or 'N/A'}
+
Accuracy
{accuracy:.1f}% [{ci_lower*100:.1f}%, {ci_upper*100:.1f}%]
+
Correct
{n_correct} / {len(completed)}
+
Pending
{n_pending}
+
Time
{self.total_time:.1f}s
+
Sampling
{sampling_str}
+
+
+ + +
+
+ + + + + + + + + + + + + + + {rows_html} + +
IDGoldAnswerTokensT/sGen sServer
+
+
+ + + + + + + + + + + + + + + + + + + + + {summary_rows_html} + +
ProblemRunsCorrectTokensT/sGen s
minavgmaxminavgmaxminavgmax
- - - - - - - - - - - - - - - {rows_html} - -
IDGoldAnswerTokensT/sGen sServer
""" @@ -1062,12 +1172,19 @@ class Processor: ) -> TaskState: question_text, prompt, expected = eval_state.get_case(i) + # Extract chunk_idx from task_id: "{dataset_type}_{chunk_idx:03d}_{index:03d}" + _parts = task_id.rsplit("_", 2) + chunk_idx = int(_parts[-2]) if len(_parts) >= 3 else 0 + problem_idx = i + task_state = TaskState( task_id=task_id, prompt=prompt, expected=expected, question_text=question_text, - server_name=server_config.name + server_name=server_config.name, + chunk_idx=chunk_idx, + problem_idx=problem_idx, ) try: @@ -1085,7 +1202,8 @@ class Processor: eval_state.add_result( task_id, prompt, expected, result, None, {"finish_reason": finish_reason}, False, task_state.status, - tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name + tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name, + chunk_idx, problem_idx, ) eval_state.dump() return task_state @@ -1108,7 +1226,8 @@ class Processor: eval_state.add_result( task_id, prompt, expected, result, answer, grader_log, is_correct, "ok", - tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name + tokens, tps_gen, t_gen_ms, reasoning_content, server_config.name, + chunk_idx, problem_idx, ) eval_state.dump() diff --git a/examples/llama-eval/llama-server-simulator.py b/examples/llama-eval/llama-server-simulator.py index 2f9cdc545..e64ba8933 100755 --- a/examples/llama-eval/llama-server-simulator.py +++ b/examples/llama-eval/llama-server-simulator.py @@ -65,34 +65,70 @@ def normalize_number(s: str) -> Optional[int]: return int(match.group(0)) class AimeDataset: - def __init__(self, split: str = "train"): + def __init__(self, split: str = "train", dataset_type: str = "aime"): self.split = split + self.dataset_type = dataset_type self.questions: List[Dict] = [] self._load_dataset() - def _load_dataset(self): - print(f"Loading AIME dataset (split: {self.split})...") + def _get_question_text(self, question: Dict) -> str: + """Get question text, handling different dataset field 
names.""" + return question.get("problem", question.get("question", "")) - cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "AI-MO___aimo-validation-aime" / "default" / "0.0.0" - if cache_path.exists(): - print(f"Using cached dataset from {cache_path}") - ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split, cache_dir=str(cache_path)) + def _load_dataset(self): + if self.dataset_type == "aime": + print(f"Loading AIME dataset (split: {self.split})...") + cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "AI-MO___aimo-validation-aime" / "default" / "0.0.0" + if cache_path.exists(): + print(f"Using cached dataset from {cache_path}") + ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split, cache_dir=str(cache_path)) + else: + ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split) + elif self.dataset_type == "aime2025": + print(f"Loading AIME2025 dataset...") + ds_list = [] + for config_name in ["AIME2025-I", "AIME2025-II"]: + cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "opencompass___AIME2025" / "default" / "0.0.0" + if cache_path.exists(): + print(f"Using cached dataset from {cache_path}") + ds = datasets.load_dataset("opencompass/AIME2025", config_name, split="test", cache_dir=str(cache_path)) + else: + ds = datasets.load_dataset("opencompass/AIME2025", config_name, split="test") + ds_list.extend(ds) + ds = ds_list else: - ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split) + raise ValueError(f"Unknown dataset type: {self.dataset_type}") self.questions = list(ds) - print(f"AIME dataset loaded: {len(self.questions)} questions") + print(f"{self.dataset_type} dataset loaded: {len(self.questions)} questions") def find_question(self, request_text: str) -> Optional[Dict]: + # Strip common template prefixes to get the actual question text + # Templates include things like "Solve the following math problem step by step..." 
+ # The actual question usually follows a blank line or after the template instruction + cleaned = request_text + # Split on double newline and take the part that looks like the problem + parts = cleaned.split('\n\n') + if len(parts) > 1: + # Find the part that's longest (likely the actual problem text) + problem_parts = [p for p in parts if len(p.strip()) > 100] + if problem_parts: + cleaned = max(problem_parts, key=lambda x: len(x)) + best_match = None best_distance = -1 best_index = -1 for i, question in enumerate(self.questions): - question_text = question["problem"] - request_lower = request_text.lower() + question_text = self._get_question_text(question) + request_lower = cleaned.lower() question_lower = question_text.lower() + # Check if question text is contained in the cleaned request + if question_lower in request_lower or request_lower in question_lower: + debug_log(f"DEBUG: Found substring match at index {i}") + return question + # Exact match if question_lower == request_lower: debug_log(f"DEBUG: Found exact match at index {i}") @@ -118,7 +154,7 @@ class AimeDataset: debug_log(f"DEBUG: Found best partial match at index {best_index} with distance {best_distance:.3f}") return best_match - debug_log(f"DEBUG: No matching question found for: {request_text[:100]}...") + debug_log(f"DEBUG: No matching question found for cleaned: {cleaned[:100]}...") return None def get_answer(self, question: Dict) -> str: @@ -134,15 +170,16 @@ class Simulator: port: int = 8033, host: str = "localhost", success_rate: float = 0.8, - dataset_split: str = "train" + dataset_split: str = "train", + dataset_type: str = "aime" ): self.port = port self.host = host self.success_rate = success_rate - self.dataset = AimeDataset(dataset_split) + self.dataset = AimeDataset(dataset_split, dataset_type) self.eval_state = EvalState( - id="aime-2025", - tasks=["aime"], + id=dataset_type, + tasks=[dataset_type], task_states={}, sampling_config={"temperature": 0, "max_tokens": 2048} ) @@ -159,6 
+196,10 @@ class Simulator: else: response_text = self._generate_wrong_answer(question) + comp_tokens = random.randint(10000, 60000) + tps_gen = random.uniform(90.0, 110.0) + t_gen_ms = comp_tokens / tps_gen * 1000 + return { "id": f"chatcmpl-{int(time.time())}", "object": "chat.completion", @@ -176,8 +217,12 @@ class Simulator: ], "usage": { "prompt_tokens": 100, - "completion_tokens": 50, - "total_tokens": 150 + "completion_tokens": comp_tokens, + "total_tokens": 100 + comp_tokens + }, + "timings": { + "predicted_ms": t_gen_ms, + "predicted_per_second": tps_gen } } @@ -218,6 +263,12 @@ class Simulator: return response class RequestHandler(BaseHTTPRequestHandler): + def do_GET(self): + if self.path == "/v1/models": + self._send_json({"data": [{"id": "llama", "object": "model"}]}, 200) + return + self._send_json({"error": "Not found"}, 404) + def do_POST(self): if self.path != "/v1/chat/completions": self._send_json({"error": "Not found"}, 404) @@ -280,6 +331,13 @@ def main(): default=0.8, help="Success rate 0-1 (default: 0.8)" ) + parser.add_argument( + "--dataset", + type=str, + default="aime", + choices=["aime", "aime2025"], + help="Dataset type (default: aime)" + ) parser.add_argument( "--dataset-split", type=str, @@ -294,7 +352,8 @@ def main(): port=args.port, host=args.host, success_rate=args.success_rate, - dataset_split=args.dataset_split + dataset_split=args.dataset_split, + dataset_type=args.dataset ) server = HTTPServer((args.host, args.port), RequestHandler) @@ -304,7 +363,7 @@ def main(): print("\n=== llama-server-simulator ===") print(f"Server running on http://{args.host}:{args.port}") print(f"Success rate: {args.success_rate}") - print(f"AIME dataset loaded: {len(simulator.dataset.questions)} questions") + print(f"{args.dataset} dataset loaded: {len(simulator.dataset.questions)} questions") print("\nPress Ctrl+C to stop\n") try: From cd963fee6a86387d598ebe3888017376d6e9e8f6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 May 2026 
09:46:34 +0300 Subject: [PATCH 14/33] save-load-state : refactor tests and improve readability (#23196) * save-load-state : refactor into separate phase functions - Split monolithic main() into 4 self-contained phase functions, each managing its own context/sampler/batch lifecycle - Each function tokenizes internally using its local ctx instance - main() is now a clean orchestrator: init -> run phases -> assert results - Proper resource cleanup on every exit path (return {} on error) Assisted-by: llama.cpp:local pi * save-load-state : use params.out_file instead of separate state_file - Remove state_file parameter from all phase functions - Each function accesses params.out_file directly - Initialize params.out_file in main alongside params.prompt Assisted-by: llama.cpp:local pi * save-load-state : use smart pointers for ctx and smpl - Replace raw llama_context* with llama_context_ptr - Replace raw llama_sampler* with llama_sampler_ptr - Remove all manual llama_free() and llama_sampler_free() calls - Keep llama_batch as raw (managed manually with llama_batch_free) Assisted-by: llama.cpp:local pi * save-load-state : add local llama_batch_ptr RAII wrapper - Add llama_batch_ptr struct holding llama_batch by value - Calls llama_batch_free() in destructor - Eliminates all manual llama_batch_free() calls Assisted-by: llama.cpp:local pi * save-load-state : replace printf/fprintf with logging macros - Add log.h include - Replace fprintf(stderr, ...) errors with LOG_ERR - Replace fprintf(stderr, ...) info with LOG_TRC - Replace printf output with LOG Assisted-by: llama.cpp:local pi * save-load-state : refactor tests to check results inline Each follow-up phase now accepts an expected result and performs the comparison internally instead of collecting results in main(). Assisted-by: llama.cpp:local pi * save-load-state : improve test output readability Add phase labels, remove redundant run prefixes, and show PASS after each test. 
Assisted-by: llama.cpp:local pi * pi : add rule about git signing * save-load-state : simplify llama_batch_ptr Change get() to return a reference and remove operator*(). Use batch.get() throughout for consistency. Assisted-by: llama.cpp:local pi * save-load-state : extract generate_tokens helper Factor out the repeated token generation loop into a shared helper function used by all phases. Assisted-by: llama.cpp:local pi * save-load-state : update comments to use test terminology Replace "Phase" with "Test" and list each test's steps as bullet points. Assisted-by: llama.cpp:local pi * save-load-state : rename test functions Rename to test_baseline, test_state_load, test_seq_cp_host, test_seq_cp_device. Update comments and logs accordingly. Assisted-by: llama.cpp:local pi * pi : add rule to never git push without confirmation Assisted-by: llama.cpp:local pi * common : add model_only option to common_init_from_params Add bool model_only parameter to skip context creation, sampler init, and context-dependent setup. Use in save-load-state to initialize only the model, with each test creating its own context. 
Assisted-by: llama.cpp:local pi --------- Co-authored-by: ggerganov --- .pi/gg/SYSTEM.md | 2 + common/common.cpp | 14 +- common/common.h | 4 +- examples/save-load-state/save-load-state.cpp | 563 ++++++++++--------- 4 files changed, 309 insertions(+), 274 deletions(-) diff --git a/.pi/gg/SYSTEM.md b/.pi/gg/SYSTEM.md index 727a850b1..b7597a4c3 100644 --- a/.pi/gg/SYSTEM.md +++ b/.pi/gg/SYSTEM.md @@ -22,6 +22,8 @@ Pull requests (PRs): Commits: - On every commit that you make, include a "Assisted-by: llama.cpp:local pi" tag - Do not explicitly set the git author in commits - rely on the default git config +- Always use `--no-gpg-sign` when committing +- Never `git push` without explicit confirmation from the user Resources (read on demand): - [CONTRIBUTING.md](CONTRIBUTING.md) diff --git a/common/common.cpp b/common/common.cpp index 9cf11ea9f..aef06263e 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1160,7 +1160,7 @@ struct common_init_result::impl { std::vector samplers_seq_config; }; -common_init_result::common_init_result(common_params & params) : +common_init_result::common_init_result(common_params & params, bool model_only) : pimpl(new impl{}) { auto mparams = common_model_params_to_llama(params); auto cparams = common_context_params_to_llama(params); @@ -1183,6 +1183,10 @@ common_init_result::common_init_result(common_params & params) : pimpl->model.reset(model); + if (model_only) { + return; + } + const llama_vocab * vocab = llama_model_get_vocab(model); // load and optionally apply lora adapters @@ -1309,8 +1313,8 @@ std::vector & common_init_result::lora() { return pimpl->lora; } -common_init_result_ptr common_init_from_params(common_params & params) { - common_init_result_ptr res(new common_init_result(params)); +common_init_result_ptr common_init_from_params(common_params & params, bool model_only) { + common_init_result_ptr res(new common_init_result(params, model_only)); llama_model * model = res->model(); if (model == NULL) { @@ -1318,6 
+1322,10 @@ common_init_result_ptr common_init_from_params(common_params & params) { return res; } + if (model_only) { + return res; + } + llama_context * lctx = res->context(); if (lctx == NULL) { LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str()); diff --git a/common/common.h b/common/common.h index 1d3d788b2..e03f70374 100644 --- a/common/common.h +++ b/common/common.h @@ -857,7 +857,7 @@ struct common_sampler; // note: defines the model, context, samplers, ets. lifetimes struct common_init_result { - common_init_result(common_params & params); + common_init_result(common_params & params, bool model_only = false); ~common_init_result(); llama_model * model(); @@ -875,7 +875,7 @@ private: using common_init_result_ptr = std::unique_ptr; -common_init_result_ptr common_init_from_params(common_params & params); +common_init_result_ptr common_init_from_params(common_params & params, bool model_only = false); struct llama_model_params common_model_params_to_llama ( common_params & params); struct llama_context_params common_context_params_to_llama(const common_params & params); diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index e6f5e9802..97ab7c6de 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -1,22 +1,296 @@ #include "arg.h" #include "common.h" -#include "llama.h" +#include "log.h" +#include "llama-cpp.h" #include #include -#include + +struct llama_batch_ptr { + llama_batch batch; + + llama_batch_ptr(int32_t n_tokens, int32_t embd, int32_t n_seq_max) + : batch{llama_batch_init(n_tokens, embd, n_seq_max)} {} + + ~llama_batch_ptr() { llama_batch_free(batch); } + + llama_batch_ptr(const llama_batch_ptr &) = delete; + llama_batch_ptr & operator=(const llama_batch_ptr &) = delete; + llama_batch_ptr(llama_batch_ptr &&) = default; + llama_batch_ptr & operator=(llama_batch_ptr &&) = default; + + llama_batch & 
get() { return batch; } + const llama_batch & get() const { return batch; } +}; + +static std::string generate_tokens(llama_context * ctx, llama_sampler * smpl, int & n_past, int32_t n_predict, llama_seq_id seq_id) { + std::string result; + llama_batch_ptr batch(1, 0, 1); + + for (int i = 0; i < n_predict; i++) { + auto next_token = llama_sampler_sample(smpl, ctx, -1); + auto next_token_str = common_token_to_piece(ctx, next_token); + + LOG("%s", next_token_str.c_str()); + result += next_token_str; + + common_batch_clear(batch.get()); + common_batch_add(batch.get(), next_token, n_past, {seq_id}, true); + + if (llama_decode(ctx, batch.get())) { + LOG_ERR("\n%s: failed to evaluate\n", __func__); + return {}; + } + n_past++; + } + + return result; +} + +// Test 1: baseline +// - tokenize the prompt +// - decode all but the last token +// - save state to disk +// - decode the last token +// - generate n_predict tokens +static std::string test_baseline(struct llama_model * model, const struct common_params & params) { + auto ctx = llama_context_ptr{llama_init_from_model(model, common_context_params_to_llama(params))}; + + auto sparams = llama_sampler_chain_default_params(); + auto smpl = llama_sampler_ptr{llama_sampler_chain_init(sparams)}; + llama_sampler_chain_add(smpl.get(), llama_sampler_init_dist(params.sampling.seed)); + + auto tokens = common_tokenize(ctx.get(), params.prompt, true); + + auto n_past = 0; + if (!common_prompt_batch_decode(ctx.get(), tokens, n_past, params.n_batch, params.out_file, true)) { + LOG_ERR("%s: failed to decode prompt\n", __func__); + return {}; + } + + LOG("\n=== Test 1: baseline ===\n"); + LOG("%s", params.prompt.c_str()); + + auto result = generate_tokens(ctx.get(), smpl.get(), n_past, params.n_predict, 0); + if (result.empty()) { + return {}; + } + + LOG("\n"); + + return result; +} + + +// Test 2: state load +// - create a new context +// - load state from file +// - replay the last prompt token +// - generate n_predict tokens and 
compare against expected result +static bool test_state_load(struct llama_model * model, const struct common_params & params, const std::string & expected_result) { + auto ctx = llama_context_ptr{llama_init_from_model(model, common_context_params_to_llama(params))}; + + auto sparams = llama_sampler_chain_default_params(); + auto smpl = llama_sampler_ptr{llama_sampler_chain_init(sparams)}; + llama_sampler_chain_add(smpl.get(), llama_sampler_init_dist(params.sampling.seed)); + + auto tokens = common_tokenize(ctx.get(), params.prompt, true); + + LOG("\n=== Test 2: state load ===\n"); + LOG("%s", params.prompt.c_str()); + + // Load state from file + std::vector unused_sts(tokens.size()); + size_t n_token_count_out = 0; + + if (!llama_state_load_file(ctx.get(), params.out_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) { + LOG_ERR("\n%s: failed to load state\n", __func__); + return false; + } + + LOG_TRC("%s: loaded state with %zu tokens\n", __func__, n_token_count_out); + + // Replay last token + int n_past = (int) n_token_count_out; + if (!common_replay_last_token(ctx.get(), tokens.back(), n_past)) { + return false; + } + n_past++; + + // Generate tokens + auto result = generate_tokens(ctx.get(), smpl.get(), n_past, params.n_predict, 0); + if (result.empty()) { + return false; + } + + if (result != expected_result) { + LOG_ERR("\n%s: error: generation differs from expected\n", __func__); + return false; + } + + LOG("\nPASS\n"); + return true; +} + + +// Test 3: seq copy (host) +// - create a multi-seq context +// - load state from file +// - replay the last prompt token +// - migrate KV cache from seq 0 to seq 1 via the CPU path +// - generate n_predict tokens on seq 1 and compare against expected result +static bool test_seq_cp_host(struct llama_model * model, const struct common_params & params, const std::string & expected_result) { + auto params_ctx = common_context_params_to_llama(params); + params_ctx.n_seq_max = 2; + auto ctx = 
llama_context_ptr{llama_init_from_model(model, params_ctx)}; + + auto sparams = llama_sampler_chain_default_params(); + auto smpl = llama_sampler_ptr{llama_sampler_chain_init(sparams)}; + llama_sampler_chain_add(smpl.get(), llama_sampler_init_dist(params.sampling.seed)); + + auto tokens = common_tokenize(ctx.get(), params.prompt, true); + + LOG("\n=== Test 3: seq copy (host) ===\n"); + LOG("%s", params.prompt.c_str()); + + // Load state from file + std::vector unused_sts(tokens.size()); + size_t n_token_count_out = 0; + + if (!llama_state_load_file(ctx.get(), params.out_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) { + LOG_ERR("\n%s: failed to load state\n", __func__); + return false; + } + + LOG_TRC("%s: loaded state with %zu tokens\n", __func__, n_token_count_out); + + // Replay last token + int n_past = (int) n_token_count_out; + if (!common_replay_last_token(ctx.get(), tokens.back(), n_past)) { + return false; + } + n_past++; + + // Migrate KV cache from seq 0 to seq 1 (CPU path) + { + std::vector seq_store(llama_state_seq_get_size(ctx.get(), 0)); + const size_t ncopy = llama_state_seq_get_data(ctx.get(), seq_store.data(), seq_store.size(), 0); + if (ncopy != seq_store.size()) { + LOG_ERR("\n%s: seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size()); + return false; + } + LOG_TRC("%s: seq 0 copied, %zd bytes\n", __func__, ncopy); + + llama_memory_clear(llama_get_memory(ctx.get()), true); + LOG_TRC("%s: kv cache cleared\n", __func__); + + const size_t nset = llama_state_seq_set_data(ctx.get(), seq_store.data(), seq_store.size(), 1); + if (nset != seq_store.size()) { + LOG_ERR("\n%s: seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size()); + return false; + } + LOG_TRC("%s: seq 1 restored, %zd bytes\n", __func__, nset); + } + + // Generate tokens on seq 1 + auto result = generate_tokens(ctx.get(), smpl.get(), n_past, params.n_predict, 1); + if 
(result.empty()) { + return false; + } + + if (result != expected_result) { + LOG_ERR("\n%s: error: generation differs from expected\n", __func__); + return false; + } + + LOG("\nPASS\n"); + return true; +} + + +// Test 4: seq copy (device) +// - create a multi-seq context +// - load state from file +// - replay the last prompt token +// - migrate KV cache from seq 0 to seq 1 via the on-device path +// - generate n_predict tokens on seq 1 and compare against expected result +static bool test_seq_cp_device(struct llama_model * model, const struct common_params & params, const std::string & expected_result) { + auto params_ctx = common_context_params_to_llama(params); + params_ctx.n_seq_max = 2; + auto ctx = llama_context_ptr{llama_init_from_model(model, params_ctx)}; + + auto sparams = llama_sampler_chain_default_params(); + auto smpl = llama_sampler_ptr{llama_sampler_chain_init(sparams)}; + llama_sampler_chain_add(smpl.get(), llama_sampler_init_dist(params.sampling.seed)); + + auto tokens = common_tokenize(ctx.get(), params.prompt, true); + + LOG("\n=== Test 4: seq copy (device) ===\n"); + LOG("%s", params.prompt.c_str()); + + // Load state from file + std::vector unused_sts(tokens.size()); + size_t n_token_count_out = 0; + + if (!llama_state_load_file(ctx.get(), params.out_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) { + LOG_ERR("\n%s: failed to load state\n", __func__); + return false; + } + + LOG_TRC("%s: loaded state with %zu tokens\n", __func__, n_token_count_out); + + // Replay last token + int n_past = (int) n_token_count_out; + if (!common_replay_last_token(ctx.get(), tokens.back(), n_past)) { + return false; + } + n_past++; + + // Migrate KV cache from seq 0 to seq 1 (on-device path) + { + std::vector seq_store(llama_state_seq_get_size_ext(ctx.get(), 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE)); + const size_t ncopy = llama_state_seq_get_data_ext(ctx.get(), seq_store.data(), seq_store.size(), 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); + if 
(ncopy != seq_store.size()) { + LOG_ERR("\n%s: seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size()); + return false; + } + LOG_TRC("%s: seq 0 copied, %zd bytes\n", __func__, ncopy); + + llama_memory_clear(llama_get_memory(ctx.get()), true); + LOG_TRC("%s: kv cache cleared\n", __func__); + + const size_t nset = llama_state_seq_set_data_ext(ctx.get(), seq_store.data(), seq_store.size(), 1, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); + if (nset != seq_store.size()) { + LOG_ERR("\n%s: seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size()); + return false; + } + LOG_TRC("%s: seq 1 restored, %zd bytes\n", __func__, nset); + } + + // Generate tokens on seq 1 + auto result = generate_tokens(ctx.get(), smpl.get(), n_past, params.n_predict, 1); + if (result.empty()) { + return false; + } + + if (result != expected_result) { + LOG_ERR("\n%s: error: generation differs from expected\n", __func__); + return false; + } + + LOG("\nPASS\n"); + return true; +} int main(int argc, char ** argv) { std::setlocale(LC_NUMERIC, "C"); common_params params; - params.prompt = "The quick brown fox"; + params.out_file = "dump_state.bin"; params.sampling.seed = 1234; - const std::string_view state_file = "dump_state.bin"; - common_init(); if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { @@ -24,8 +298,7 @@ int main(int argc, char ** argv) { } if (params.n_parallel == 1) { - // the example uses 2 sequences, so when n_parallel == 1, we need to enable unified kv cache - printf("%s: n_parallel == 1, enabling unified kv cache\n", __func__); + LOG_TRC("%s: n_parallel == 1, enabling unified kv cache\n", __func__); params.kv_unified = true; } @@ -33,288 +306,40 @@ int main(int argc, char ** argv) { params.n_predict = 16; } - auto n_past = 0; - - std::string result0; - std::string result1; - std::string result2; - std::string result3; - - // init - ggml_backend_load_all(); - auto llama_init = 
common_init_from_params(params); - + auto llama_init = common_init_from_params(params, true); auto * model = llama_init->model(); - auto * ctx = llama_init->context(); - if (model == nullptr || ctx == nullptr) { - fprintf(stderr, "%s : failed to init\n", __func__); + if (model == nullptr) { + LOG_ERR("%s: failed to init\n", __func__); return 1; } - auto sparams = llama_sampler_chain_default_params(); + GGML_ASSERT(llama_init->context() == nullptr); - llama_sampler * smpl = llama_sampler_chain_init(sparams); - - llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sampling.seed)); - - // tokenize prompt - auto tokens = common_tokenize(ctx, params.prompt, true); - - const bool save_state = true; - if (!common_prompt_batch_decode(ctx, tokens, n_past, params.n_batch, state_file, save_state)) { + // Test 1: baseline (saves state to disk) + auto result_baseline = test_baseline(model, params); + if (result_baseline.empty()) { return 1; } - // first run - printf("\nfirst run: %s", params.prompt.c_str()); - - llama_batch batch = llama_batch_init(1, 0, 1); - - for (auto i = 0; i < params.n_predict; i++) { - auto next_token = llama_sampler_sample(smpl, ctx, -1); - auto next_token_str = common_token_to_piece(ctx, next_token); - - printf("%s", next_token_str.c_str()); - result0 += next_token_str; - - common_batch_clear(batch); - common_batch_add(batch, next_token, n_past, {0}, true); - - if (llama_decode(ctx, batch)) { - fprintf(stderr, "\n%s : failed to evaluate\n", __func__); - llama_batch_free(batch); - return 1; - } - n_past += 1; - } - - printf("\n\n"); - - // make new context - llama_context * ctx2 = llama_init_from_model(model, common_context_params_to_llama(params)); - - llama_sampler * smpl2 = llama_sampler_chain_init(sparams); - - llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sampling.seed)); - - printf("\nsecond run: %s", params.prompt.c_str()); - - // load state from file - std::vector unused_sts(tokens.size()); // unused session tokens. 
- size_t n_token_count_out = 0; - - if (!llama_state_load_file(ctx2, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) { - fprintf(stderr, "\n%s : failed to load state\n", __func__); + // Test 2: state load + if (!test_state_load(model, params, result_baseline)) { return 1; } - fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out); - - // restore state (last tokens) - n_past = n_token_count_out; - if (!common_replay_last_token(ctx2, tokens.back(), n_past)) { - return 1; - } - ++n_past; - - // second run - for (auto i = 0; i < params.n_predict; i++) { - auto next_token = llama_sampler_sample(smpl2, ctx2, -1); - auto next_token_str = common_token_to_piece(ctx2, next_token); - - printf("%s", next_token_str.c_str()); - result1 += next_token_str; - - common_batch_clear(batch); - common_batch_add(batch, next_token, n_past, {0}, true); - - if (llama_decode(ctx2, batch)) { - fprintf(stderr, "\n%s : failed to evaluate\n", __func__); - llama_batch_free(batch); - return 1; - } - n_past += 1; - } - - printf("\n\n"); - - if (result0 != result1) { - fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__); + // Test 3: seq copy (host) + if (!test_seq_cp_host(model, params, result_baseline)) { return 1; } - // make new context - auto params_ctx3 = common_context_params_to_llama(params); - params_ctx3.n_seq_max = 2; - llama_context * ctx3 = llama_init_from_model(model, params_ctx3); - - llama_sampler * smpl3 = llama_sampler_chain_init(sparams); - - llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sampling.seed)); - - printf("\nsingle seq run: %s", params.prompt.c_str()); - - // load state (rng, logits, embedding and kv_cache) from file - n_token_count_out = 0; - - if (!llama_state_load_file(ctx3, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) { - fprintf(stderr, "\n%s : failed to load state\n", __func__); + // Test 4: seq copy (device) + if 
(!test_seq_cp_device(model, params, result_baseline)) { return 1; } - fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out); - - // restore state (last tokens) - n_past = n_token_count_out; - if (!common_replay_last_token(ctx3, tokens.back(), n_past)) { - return 1; - } - ++n_past; - - // save seq 0 and load into seq 1 - { - // save kv of seq 0 - std::vector seq_store(llama_state_seq_get_size(ctx3, 0)); - const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), seq_store.size(), 0); - if (ncopy != seq_store.size()) { - fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size()); - return 1; - } - fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy); - - // erase whole kv - llama_memory_clear(llama_get_memory(ctx3), true); - fprintf(stderr, "%s : kv cache cleared\n", __func__); - - // restore kv into seq 1 - const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), seq_store.size(), 1); - if (nset != seq_store.size()) { - fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size()); - return 1; - } - fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset); - } - - // third run with seq 1 instead of 0 - for (auto i = 0; i < params.n_predict; i++) { - auto next_token = llama_sampler_sample(smpl3, ctx3, -1); - auto next_token_str = common_token_to_piece(ctx3, next_token); - - printf("%s", next_token_str.c_str()); - result2 += next_token_str; - - common_batch_clear(batch); - common_batch_add(batch, next_token, n_past, {1}, true); - - if (llama_decode(ctx3, batch)) { - fprintf(stderr, "\n%s : failed to evaluate\n", __func__); - llama_batch_free(batch); - return 1; - } - n_past += 1; - } - - // test on-device state save/load - auto params_ctx4 = common_context_params_to_llama(params); - params_ctx4.n_seq_max = 2; - llama_context * ctx4 = llama_init_from_model(model, 
params_ctx4); - - llama_sampler * smpl4 = llama_sampler_chain_init(sparams); - - llama_sampler_chain_add(smpl4, llama_sampler_init_dist(params.sampling.seed)); - - printf("\nsingle seq run: %s", params.prompt.c_str()); - - // load state (rng, logits, embedding and kv_cache) from file - n_token_count_out = 0; - - if (!llama_state_load_file(ctx4, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) { - fprintf(stderr, "\n%s : failed to load state\n", __func__); - return 1; - } - - fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out); - - // restore state (last tokens) - n_past = n_token_count_out; - if (!common_replay_last_token(ctx4, tokens.back(), n_past)) { - return 1; - } - ++n_past; - - // save seq 0 and load into seq 1 - { - // save kv of seq 0 - std::vector seq_store(llama_state_seq_get_size_ext(ctx4, 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE)); - const size_t ncopy = llama_state_seq_get_data_ext(ctx4, seq_store.data(), seq_store.size(), 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); - if (ncopy != seq_store.size()) { - fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size()); - return 1; - } - fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy); - - // erase whole kv - llama_memory_clear(llama_get_memory(ctx4), true); - fprintf(stderr, "%s : kv cache cleared\n", __func__); - - // restore kv into seq 0 - const size_t nset = llama_state_seq_set_data_ext(ctx4, seq_store.data(), seq_store.size(), 1, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE); - if (nset != seq_store.size()) { - fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size()); - return 1; - } - fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset); - } - - // forth run - for (auto i = 0; i < params.n_predict; i++) { - auto next_token = llama_sampler_sample(smpl4, ctx4, -1); - auto next_token_str = 
common_token_to_piece(ctx4, next_token); - - printf("%s", next_token_str.c_str()); - result3 += next_token_str; - - common_batch_clear(batch); - common_batch_add(batch, next_token, n_past, {1}, true); - - if (llama_decode(ctx4, batch)) { - fprintf(stderr, "\n%s : failed to evaluate\n", __func__); - llama_batch_free(batch); - return 1; - } - n_past += 1; - } - - printf("\n"); - - llama_sampler_free(smpl); - llama_sampler_free(smpl2); - llama_sampler_free(smpl3); - llama_sampler_free(smpl4); - - llama_batch_free(batch); - - // this one is managed by common_init_result - //llama_free(ctx); - - llama_free(ctx2); - llama_free(ctx3); - llama_free(ctx4); - - if (result0 != result2) { - fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__); - return 1; - } - - if (result0 != result3) { - fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__); - return 1; - } - - fprintf(stderr, "\n%s : success\n", __func__); + LOG("\nAll tests passed.\n"); return 0; } From 3c81c8deeabba01fa40869325ea80d07eef75fc6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 May 2026 09:46:58 +0300 Subject: [PATCH 15/33] server : print graphs reused in slot timings (#23279) Add graphs reused counter to the per-slot timing output, printed via llama_perf_context(). 
Assisted-by: llama.cpp:local pi Co-authored-by: ggerganov --- tools/server/server-context.cpp | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 6b16c6b49..88b207ad5 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -467,20 +467,26 @@ struct server_slot { const double n_gen_second = 1e3 / t_token_generation * n_decoded; SLT_INF(*this, - "\n" - "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n" - " eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n" + "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", + t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second); + + SLT_INF(*this, + " eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", + t_token_generation, n_decoded, t_gen, n_gen_second); + + SLT_INF(*this, " total time = %10.2f ms / %5d tokens\n", - t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second, - t_token_generation, n_decoded, t_gen, n_gen_second, t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded); + SLT_INF(*this, + " graphs reused = %10d\n", + llama_perf_context(ctx_tgt).n_reused); + if (n_draft_total > 0) { const float draft_ratio = (float) n_draft_accepted / n_draft_total; - SLT_CNT(*this, - "draft acceptance rate = %0.5f (%5d accepted / %5d generated)\n", - draft_ratio, n_draft_accepted, n_draft_total - ); + SLT_INF(*this, + "draft acceptance = %0.5f (%5d accepted / %5d generated)\n", + draft_ratio, n_draft_accepted, n_draft_total); } common_speculative_print_stats(spec); From ccee42642677005555b28c6ef93760e2604348e8 Mon Sep 17 00:00:00 2001 From: Pascal Date: Tue, 19 May 2026 08:49:01 +0200 Subject: [PATCH 16/33] server-context: guarantee there is at least 1 token to decode (#23280) --- 
tools/server/server-context.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 88b207ad5..dc3189e17 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -2589,9 +2589,9 @@ private: llama_pos pos_next = slot.prompt.tokens.pos_next(n_past); // the largest pos_min required for a checkpoint to be useful - const auto pos_min_thold = std::max(0, pos_next - n_swa); + const auto pos_min_thold = std::max(0, pos_next - n_swa - 1); - if (n_past > 0 && n_past < slot.prompt.n_tokens()) { + if (n_past > 0 && n_past <= slot.prompt.n_tokens()) { const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_tgt), slot.id); if (pos_min == -1) { SLT_ERR(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d\n", n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min); From 00c461ce1a9deb238eed40a8f869a72729fa3d4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Tue, 19 May 2026 09:06:56 +0200 Subject: [PATCH 17/33] ci : install server kleidiai runner dependencies (#23259) --- .github/workflows/server-self-hosted.yml | 26 ++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/.github/workflows/server-self-hosted.yml b/.github/workflows/server-self-hosted.yml index d06ad3d24..3522681d9 100644 --- a/.github/workflows/server-self-hosted.yml +++ b/.github/workflows/server-self-hosted.yml @@ -152,6 +152,32 @@ jobs: fetch-depth: 0 ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} + - name: Dependencies + id: depends + run: | + set -euxo pipefail + sudo apt-get update + sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \ + apt-get install -y \ + build-essential \ + python3-venv \ + gpg \ + wget \ + time \ + git-lfs + + git lfs install + + # install the latest cmake + sudo install -d /usr/share/keyrings + wget -O 
- https://apt.kitware.com/keys/kitware-archive-latest.asc \ + | gpg --dearmor \ + | sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null + echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \ + | sudo tee /etc/apt/sources.list.d/kitware.list + sudo apt-get update + sudo apt-get install -y cmake + - name: Build id: cmake_build run: | From 4b262ab662d46fd9dd1d53671b82c09d8b0af024 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Tue, 19 May 2026 10:11:04 +0200 Subject: [PATCH 18/33] ci : install libssl-dev (#23325) --- .github/workflows/server-self-hosted.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/server-self-hosted.yml b/.github/workflows/server-self-hosted.yml index 3522681d9..857c72a46 100644 --- a/.github/workflows/server-self-hosted.yml +++ b/.github/workflows/server-self-hosted.yml @@ -160,6 +160,7 @@ jobs: sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \ apt-get install -y \ build-essential \ + libssl-dev \ python3-venv \ gpg \ wget \ From 6db130445d29b243ee2171efb8cd61b84a1c5322 Mon Sep 17 00:00:00 2001 From: Aleksander Grygier Date: Tue, 19 May 2026 10:16:04 +0200 Subject: [PATCH 19/33] ui: Bump packages + address build warnings (#23300) * chore: Update vulnerable packages * chore: Formatting * refactor: Update Tailwind CSS imports * ci: Use `ubuntu-latest` for Unit/E2E UI tests * chore: Bump package * fix: Add missing tag * refactor: Enums files naming --- .github/workflows/ui-ci.yml | 4 +- tools/ui/.gitignore | 2 +- tools/ui/eslint.config.js | 5 +- tools/ui/package-lock.json | 54 ++++---- tools/ui/src/app.css | 5 +- ...tAttachmentsPreviewCurrentItemVideo.svelte | 1 + .../MarkdownContent/MarkdownContent.svelte | 2 +- .../settings/SettingsChat/SettingsChat.svelte | 2 +- .../SettingsChat/SettingsChatFields.svelte | 2 +- tools/ui/src/lib/constants/mcp.ts | 2 +- .../ui/src/lib/constants/settings-registry.ts | 4 +- 
.../src/lib/constants/supported-file-types.ts | 2 +- tools/ui/src/lib/constants/tools.ts | 2 +- .../enums/{agentic.ts => agentic.enums.ts} | 0 .../{attachment.ts => attachment.enums.ts} | 0 .../src/lib/enums/{chat.ts => chat.enums.ts} | 0 .../lib/enums/{files.ts => files.enums.ts} | 0 tools/ui/src/lib/enums/index.ts | 22 +-- .../enums/{keyboard.ts => keyboard.enums.ts} | 0 .../ui/src/lib/enums/{mcp.ts => mcp.enums.ts} | 0 .../lib/enums/{model.ts => model.enums.ts} | 0 .../lib/enums/{server.ts => server.enums.ts} | 0 .../enums/{settings.ts => settings.enums.ts} | 0 .../lib/enums/{tools.ts => tools.enums.ts} | 0 tools/ui/src/lib/enums/{ui.ts => ui.enums.ts} | 0 tools/ui/src/lib/types/mcp.d.ts | 4 +- tools/ui/src/routes/+layout.svelte | 29 ++-- tools/ui/vitest-setup-client.ts | 130 +++++++++--------- tools/ui/vitest.shims.d.ts | 1 + 29 files changed, 141 insertions(+), 132 deletions(-) rename tools/ui/src/lib/enums/{agentic.ts => agentic.enums.ts} (100%) rename tools/ui/src/lib/enums/{attachment.ts => attachment.enums.ts} (100%) rename tools/ui/src/lib/enums/{chat.ts => chat.enums.ts} (100%) rename tools/ui/src/lib/enums/{files.ts => files.enums.ts} (100%) rename tools/ui/src/lib/enums/{keyboard.ts => keyboard.enums.ts} (100%) rename tools/ui/src/lib/enums/{mcp.ts => mcp.enums.ts} (100%) rename tools/ui/src/lib/enums/{model.ts => model.enums.ts} (100%) rename tools/ui/src/lib/enums/{server.ts => server.enums.ts} (100%) rename tools/ui/src/lib/enums/{settings.ts => settings.enums.ts} (100%) rename tools/ui/src/lib/enums/{tools.ts => tools.enums.ts} (100%) rename tools/ui/src/lib/enums/{ui.ts => ui.enums.ts} (100%) create mode 100644 tools/ui/vitest.shims.d.ts diff --git a/.github/workflows/ui-ci.yml b/.github/workflows/ui-ci.yml index 7f6f467dd..761a93194 100644 --- a/.github/workflows/ui-ci.yml +++ b/.github/workflows/ui-ci.yml @@ -41,7 +41,7 @@ jobs: ui-checks: name: UI Checks needs: ui-build - runs-on: ubuntu-slim + runs-on: ubuntu-latest continue-on-error: true 
steps: - name: Checkout code @@ -93,7 +93,7 @@ jobs: e2e-tests: name: E2E Tests needs: ui-build - runs-on: ubuntu-slim + runs-on: ubuntu-latest steps: - name: Checkout code uses: actions/checkout@v6 diff --git a/tools/ui/.gitignore b/tools/ui/.gitignore index 051d884b0..22ed6125f 100644 --- a/tools/ui/.gitignore +++ b/tools/ui/.gitignore @@ -25,4 +25,4 @@ vite.config.ts.timestamp-* *storybook.log storybook-static -*.code-workspace \ No newline at end of file +*.code-workspace diff --git a/tools/ui/eslint.config.js b/tools/ui/eslint.config.js index 185da1dab..4ed9dd7ca 100644 --- a/tools/ui/eslint.config.js +++ b/tools/ui/eslint.config.js @@ -20,9 +20,7 @@ export default ts.config( prettier, ...svelte.configs.prettier, { - languageOptions: { - globals: { ...globals.browser, ...globals.node } - }, + languageOptions: { globals: { ...globals.browser, ...globals.node } }, rules: { // typescript-eslint strongly recommend that you do not use the no-undef lint rule on TypeScript projects. // see: https://typescript-eslint.io/troubleshooting/faqs/eslint/#i-get-errors-from-the-no-undef-rule-about-global-variables-not-being-defined-even-though-there-are-no-typescript-errors @@ -30,6 +28,7 @@ export default ts.config( 'svelte/no-at-html-tags': 'off', // This app uses hash-based routing (#/) where resolve() from $app/paths does not apply 'svelte/no-navigation-without-resolve': 'off', + // Enforce empty line at end of file 'eol-last': 'error' } diff --git a/tools/ui/package-lock.json b/tools/ui/package-lock.json index 3686eb326..4d012c819 100644 --- a/tools/ui/package-lock.json +++ b/tools/ui/package-lock.json @@ -2307,9 +2307,9 @@ } }, "node_modules/@sveltejs/kit": { - "version": "2.59.1", - "resolved": "https://registry.npmjs.org/@sveltejs/kit/-/kit-2.59.1.tgz", - "integrity": "sha512-d8OON70AphLdDesuTIl//M2O6fRTIicX8aYv8vhCiYEhTTI2OboKqey0Hu1A4VFhqwgqtq0vKDmPFGkw8kKmgw==", + "version": "2.60.1", + "resolved": "https://registry.npmjs.org/@sveltejs/kit/-/kit-2.60.1.tgz", + 
"integrity": "sha512-mQjlkNo+rJvpln7V2IGY2j99BqhcFbS4UN0AQNKNYfhBAFZTuCDAdW3a1sgf330mvtNvsBXn3HpAhcmvdJTcIQ==", "dev": true, "license": "MIT", "dependencies": { @@ -2318,7 +2318,7 @@ "@types/cookie": "^0.6.0", "acorn": "^8.14.1", "cookie": "^0.6.0", - "devalue": "^5.6.4", + "devalue": "^5.8.1", "esm-env": "^1.2.2", "kleur": "^4.1.5", "magic-string": "^0.30.5", @@ -4296,9 +4296,9 @@ } }, "node_modules/devalue": { - "version": "5.6.4", - "resolved": "https://registry.npmjs.org/devalue/-/devalue-5.6.4.tgz", - "integrity": "sha512-Gp6rDldRsFh/7XuouDbxMH3Mx8GMCcgzIb1pDTvNyn8pZGQ22u+Wa+lGV9dQCltFQ7uVw0MhRyb8XDskNFOReA==", + "version": "5.8.1", + "resolved": "https://registry.npmjs.org/devalue/-/devalue-5.8.1.tgz", + "integrity": "sha512-4CXDYRBGqN+57wVJkuXBYmpAVUSg3L6JAQa/DFqm238G73E1wuyc/JhGQJzN7vUf/CMphYau2zXbfWzDR5aTEw==", "license": "MIT" }, "node_modules/devlop": { @@ -4856,12 +4856,12 @@ } }, "node_modules/express-rate-limit": { - "version": "8.5.0", - "resolved": "https://registry.npmjs.org/express-rate-limit/-/express-rate-limit-8.5.0.tgz", - "integrity": "sha512-XKhFohWaSBdVJNTi5TaHziqnPkv04I9UQV6q1Wy7Ui6GGQZVW12ojDFwqer14EvCXxjvPG0CyWXx7cAXpALB4Q==", + "version": "8.5.2", + "resolved": "https://registry.npmjs.org/express-rate-limit/-/express-rate-limit-8.5.2.tgz", + "integrity": "sha512-5Kb34ipNX694DH48vN9irak1Qx30nb0PLYHXfJgw4YEjiC3ZEmZJhwOp+VfiCYwFzvFTdB9QkArYS5kXa2cx2A==", "license": "MIT", "dependencies": { - "ip-address": "10.1.0" + "ip-address": "^10.2.0" }, "engines": { "node": ">= 16" @@ -4909,9 +4909,9 @@ "license": "MIT" }, "node_modules/fast-uri": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.1.0.tgz", - "integrity": "sha512-iPeeDKJSWf4IEOasVVrknXpaBV0IApz/gp7S2bb7Z4Lljbl2MGJRqInZiUrQwV16cpzw/D3S5j5Julj/gT52AA==", + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.1.2.tgz", + "integrity": 
"sha512-rVjf7ArG3LTk+FS6Yw81V1DLuZl1bRbNrev6Tmd/9RaroeeRRJhAt7jg/6YFxbvAQXUCavSoZhPPj6oOx+5KjQ==", "funding": [ { "type": "github", @@ -5541,9 +5541,9 @@ } }, "node_modules/hono": { - "version": "4.12.14", - "resolved": "https://registry.npmjs.org/hono/-/hono-4.12.14.tgz", - "integrity": "sha512-am5zfg3yu6sqn5yjKBNqhnTX7Cv+m00ox+7jbaKkrLMRJ4rAdldd1xPd/JzbBWspqaQv6RSTrgFN95EsfhC+7w==", + "version": "4.12.19", + "resolved": "https://registry.npmjs.org/hono/-/hono-4.12.19.tgz", + "integrity": "sha512-xa3eYXYXx68XTT4hZ7dRzsXBhaq85ToSrlUJNoR0gwz/1Ap/CNwX47wfvV7pc/xWhjKVVkLT7zBJy8chhNguqQ==", "license": "MIT", "engines": { "node": ">=16.9.0" @@ -5722,9 +5722,9 @@ "license": "MIT" }, "node_modules/ip-address": { - "version": "10.1.0", - "resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.1.0.tgz", - "integrity": "sha512-XXADHxXmvT9+CRxhXg56LJovE+bmWnEWB78LB83VZTprKTmaC5QfruXocxzTZ2Kl0DNwKuBdlIhjL8LeY8Sf8Q==", + "version": "10.2.0", + "resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.2.0.tgz", + "integrity": "sha512-/+S6j4E9AHvW9SWMSEY9Xfy66O5PWvVEJ08O0y5JGyEKQpojb0K0GKpz/v5HJ/G0vi3D2sjGK78119oXZeE0qA==", "license": "MIT", "engines": { "node": ">= 12" @@ -9245,9 +9245,9 @@ } }, "node_modules/svelte": { - "version": "5.55.1", - "resolved": "https://registry.npmjs.org/svelte/-/svelte-5.55.1.tgz", - "integrity": "sha512-QjvU7EFemf6mRzdMGlAFttMWtAAVXrax61SZYHdkD6yoVGQ89VeyKfZD4H1JrV1WLmJBxWhFch9H6ig/87VGjw==", + "version": "5.55.7", + "resolved": "https://registry.npmjs.org/svelte/-/svelte-5.55.7.tgz", + "integrity": "sha512-ymI5ykLPwIHW839E053FQbI1G+jnRFJEw3Kv5Y4njixVWywQBx+NUFpkkKyk5LIb36Fg9DVXSYpqiGekLD0hyw==", "license": "MIT", "dependencies": { "@jridgewell/remapping": "^2.3.4", @@ -9259,7 +9259,7 @@ "aria-query": "5.3.1", "axobject-query": "^4.1.0", "clsx": "^2.1.1", - "devalue": "^5.6.4", + "devalue": "^5.8.1", "esm-env": "^1.2.1", "esrap": "^2.2.4", "is-reference": "^3.0.3", @@ -10606,9 +10606,9 @@ "license": "ISC" }, "node_modules/ws": 
{ - "version": "8.18.3", - "resolved": "https://registry.npmjs.org/ws/-/ws-8.18.3.tgz", - "integrity": "sha512-PEIGCY5tSlUt50cqyMXfCzX+oOPqN0vuGqWzbcJ2xvnkzkq46oOpz7dQaTDBdfICb4N14+GARUDw2XV2N4tvzg==", + "version": "8.20.1", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.20.1.tgz", + "integrity": "sha512-It4dO0K5v//JtTXuPkfEOaI3uUN87iYPnqo/ZzqCoG3g8uhA66QUMs/SrM0YK7/NAu+r4LMh/9dq2A7k+rHs+w==", "dev": true, "license": "MIT", "engines": { diff --git a/tools/ui/src/app.css b/tools/ui/src/app.css index d6dc6670c..29b1d3c64 100644 --- a/tools/ui/src/app.css +++ b/tools/ui/src/app.css @@ -1,6 +1,7 @@ @import 'tailwindcss'; -@source "."; - +@source '.'; +@plugin '@tailwindcss/forms'; +@plugin '@tailwindcss/typography'; @import 'tw-animate-css'; @custom-variant dark (&:is(.dark *)); diff --git a/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemVideo.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemVideo.svelte index 4ebbd5922..62040b36f 100644 --- a/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemVideo.svelte +++ b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsPreview/ChatAttachmentsPreviewCurrentItem/ChatAttachmentsPreviewCurrentItemVideo.svelte @@ -15,6 +15,7 @@ {#if videoSrc} {:else} diff --git a/tools/ui/src/lib/components/app/content/MarkdownContent/MarkdownContent.svelte b/tools/ui/src/lib/components/app/content/MarkdownContent/MarkdownContent.svelte index 3a11854b6..0412414ae 100644 --- a/tools/ui/src/lib/components/app/content/MarkdownContent/MarkdownContent.svelte +++ b/tools/ui/src/lib/components/app/content/MarkdownContent/MarkdownContent.svelte @@ -28,7 +28,7 @@ SETTINGS_KEYS } from '$lib/constants'; import { ColorMode, UrlProtocol } from 
'$lib/enums'; - import { FileTypeText } from '$lib/enums/files'; + import { FileTypeText } from '$lib/enums/files.enums'; import { highlightCode, detectIncompleteCodeBlock, type IncompleteCodeBlock } from '$lib/utils'; import '$styles/katex-custom.scss'; import githubDarkCss from 'highlight.js/styles/github-dark.css?inline'; diff --git a/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChat.svelte b/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChat.svelte index 109c8ff9d..d017fe204 100644 --- a/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChat.svelte +++ b/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChat.svelte @@ -17,7 +17,7 @@ } from '$lib/constants'; import { RouterService } from '$lib/services/router.service'; import { setMode } from 'mode-watcher'; - import { ColorMode } from '$lib/enums/ui'; + import { ColorMode } from '$lib/enums/ui.enums'; import { fade } from 'svelte/transition'; import { goto } from '$app/navigation'; import { page } from '$app/state'; diff --git a/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChatFields.svelte b/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChatFields.svelte index 069855eeb..7c1c5c897 100644 --- a/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChatFields.svelte +++ b/tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChatFields.svelte @@ -6,7 +6,7 @@ import * as Select from '$lib/components/ui/select'; import { Textarea } from '$lib/components/ui/textarea'; import { SETTING_CONFIG_INFO, SETTINGS_KEYS } from '$lib/constants'; - import { SettingsFieldType } from '$lib/enums/settings'; + import { SettingsFieldType } from '$lib/enums/settings.enums'; import { settingsStore } from '$lib/stores/settings.svelte'; import { serverStore } from '$lib/stores/server.svelte'; import { modelsStore, selectedModelName, propsCacheVersion } from '$lib/stores/models.svelte'; diff --git 
a/tools/ui/src/lib/constants/mcp.ts b/tools/ui/src/lib/constants/mcp.ts index 19bdd92ea..918eb9f94 100644 --- a/tools/ui/src/lib/constants/mcp.ts +++ b/tools/ui/src/lib/constants/mcp.ts @@ -2,7 +2,7 @@ import { Zap, Globe, Radio } from '@lucide/svelte'; import { MCPTransportType } from '$lib/enums'; import type { ClientCapabilities, Implementation } from '$lib/types'; import type { Component } from 'svelte'; -import { MimeTypeImage } from '$lib/enums/files'; +import { MimeTypeImage } from '$lib/enums/files.enums'; export const DEFAULT_CLIENT_VERSION = '1.0.0'; export const MCP_CLIENT_NAME = 'llama-ui-mcp'; diff --git a/tools/ui/src/lib/constants/settings-registry.ts b/tools/ui/src/lib/constants/settings-registry.ts index bdbb17d96..93b3cd5ed 100644 --- a/tools/ui/src/lib/constants/settings-registry.ts +++ b/tools/ui/src/lib/constants/settings-registry.ts @@ -1,5 +1,5 @@ -import { ColorMode } from '$lib/enums/ui'; -import { SettingsFieldType } from '$lib/enums/settings'; +import { ColorMode } from '$lib/enums/ui.enums'; +import { SettingsFieldType } from '$lib/enums/settings.enums'; import { SyncableParameterType } from '$lib/enums'; import { Funnel, diff --git a/tools/ui/src/lib/constants/supported-file-types.ts b/tools/ui/src/lib/constants/supported-file-types.ts index 345054389..414116154 100644 --- a/tools/ui/src/lib/constants/supported-file-types.ts +++ b/tools/ui/src/lib/constants/supported-file-types.ts @@ -18,7 +18,7 @@ import { MimeTypeApplication, MimeTypeText } from '$lib/enums'; -import { FileExtensionVideo, FileTypeVideo } from '$lib/enums/files'; +import { FileExtensionVideo, FileTypeVideo } from '$lib/enums/files.enums'; // File type configuration using enums export const AUDIO_FILE_TYPES = { diff --git a/tools/ui/src/lib/constants/tools.ts b/tools/ui/src/lib/constants/tools.ts index 22b22309c..efc3476cd 100644 --- a/tools/ui/src/lib/constants/tools.ts +++ b/tools/ui/src/lib/constants/tools.ts @@ -1,4 +1,4 @@ -import { ToolSource } from 
'$lib/enums/tools'; +import { ToolSource } from '$lib/enums/tools.enums'; export const TOOL_GROUP_LABELS = { [ToolSource.BUILTIN]: 'Built-in', diff --git a/tools/ui/src/lib/enums/agentic.ts b/tools/ui/src/lib/enums/agentic.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/agentic.ts rename to tools/ui/src/lib/enums/agentic.enums.ts diff --git a/tools/ui/src/lib/enums/attachment.ts b/tools/ui/src/lib/enums/attachment.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/attachment.ts rename to tools/ui/src/lib/enums/attachment.enums.ts diff --git a/tools/ui/src/lib/enums/chat.ts b/tools/ui/src/lib/enums/chat.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/chat.ts rename to tools/ui/src/lib/enums/chat.enums.ts diff --git a/tools/ui/src/lib/enums/files.ts b/tools/ui/src/lib/enums/files.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/files.ts rename to tools/ui/src/lib/enums/files.enums.ts diff --git a/tools/ui/src/lib/enums/index.ts b/tools/ui/src/lib/enums/index.ts index 3cf81286b..a17cca1d8 100644 --- a/tools/ui/src/lib/enums/index.ts +++ b/tools/ui/src/lib/enums/index.ts @@ -4,9 +4,9 @@ export { AttachmentItemEnabledWhen, AttachmentAction, AttachmentItemVisibleWhen -} from './attachment'; +} from './attachment.enums'; -export { AgenticSectionType, ToolCallType } from './agentic'; +export { AgenticSectionType, ToolCallType } from './agentic.enums'; export { ChatMessageStatsView, @@ -17,7 +17,7 @@ export { MessageType, PdfViewMode, ReasoningFormat -} from './chat'; +} from './chat.enums'; export { FileTypeCategory, @@ -38,7 +38,7 @@ export { MimeTypeImage, MimeTypeText, SpecialFileType -} from './files'; +} from './files.enums'; export { MCPConnectionPhase, @@ -48,16 +48,16 @@ export { MCPContentType, MCPRefType, JsonSchemaType -} from './mcp'; +} from './mcp.enums'; -export { ModelModality } from './model'; +export { ModelModality } from './model.enums'; -export { ServerRole, ServerModelStatus } from 
'./server'; +export { ServerRole, ServerModelStatus } from './server.enums'; -export { ParameterSource, SyncableParameterType, SettingsFieldType } from './settings'; +export { ParameterSource, SyncableParameterType, SettingsFieldType } from './settings.enums'; -export { ColorMode, HtmlInputType, McpPromptVariant, TooltipSide, UrlProtocol } from './ui'; +export { ColorMode, HtmlInputType, McpPromptVariant, TooltipSide, UrlProtocol } from './ui.enums'; -export { KeyboardKey } from './keyboard'; +export { KeyboardKey } from './keyboard.enums'; -export { ToolSource, ToolPermissionDecision, ToolResponseField } from './tools'; +export { ToolSource, ToolPermissionDecision, ToolResponseField } from './tools.enums'; diff --git a/tools/ui/src/lib/enums/keyboard.ts b/tools/ui/src/lib/enums/keyboard.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/keyboard.ts rename to tools/ui/src/lib/enums/keyboard.enums.ts diff --git a/tools/ui/src/lib/enums/mcp.ts b/tools/ui/src/lib/enums/mcp.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/mcp.ts rename to tools/ui/src/lib/enums/mcp.enums.ts diff --git a/tools/ui/src/lib/enums/model.ts b/tools/ui/src/lib/enums/model.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/model.ts rename to tools/ui/src/lib/enums/model.enums.ts diff --git a/tools/ui/src/lib/enums/server.ts b/tools/ui/src/lib/enums/server.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/server.ts rename to tools/ui/src/lib/enums/server.enums.ts diff --git a/tools/ui/src/lib/enums/settings.ts b/tools/ui/src/lib/enums/settings.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/settings.ts rename to tools/ui/src/lib/enums/settings.enums.ts diff --git a/tools/ui/src/lib/enums/tools.ts b/tools/ui/src/lib/enums/tools.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/tools.ts rename to tools/ui/src/lib/enums/tools.enums.ts diff --git a/tools/ui/src/lib/enums/ui.ts 
b/tools/ui/src/lib/enums/ui.enums.ts similarity index 100% rename from tools/ui/src/lib/enums/ui.ts rename to tools/ui/src/lib/enums/ui.enums.ts diff --git a/tools/ui/src/lib/types/mcp.d.ts b/tools/ui/src/lib/types/mcp.d.ts index 7aa050cdf..2a2926142 100644 --- a/tools/ui/src/lib/types/mcp.d.ts +++ b/tools/ui/src/lib/types/mcp.d.ts @@ -1,5 +1,5 @@ -import type { MCPConnectionPhase, MCPLogLevel, HealthCheckStatus } from '$lib/enums/mcp'; -import type { ToolSource } from '$lib/enums/tools'; +import type { MCPConnectionPhase, MCPLogLevel, HealthCheckStatus } from '$lib/enums/mcp.enums'; +import type { ToolSource } from '$lib/enums/tools.enums'; import type { Client, ClientCapabilities as SDKClientCapabilities, diff --git a/tools/ui/src/routes/+layout.svelte b/tools/ui/src/routes/+layout.svelte index b35d20a5c..78227df3c 100644 --- a/tools/ui/src/routes/+layout.svelte +++ b/tools/ui/src/routes/+layout.svelte @@ -7,11 +7,13 @@ import { untrack } from 'svelte'; import { onMount } from 'svelte'; import { fade } from 'svelte/transition'; + import { DesktopIconStrip, DialogConversationTitleUpdate, SidebarNavigation } from '$lib/components/app'; + import { conversationsStore } from '$lib/stores/conversations.svelte'; import * as Sidebar from '$lib/components/ui/sidebar/index.js'; import * as Tooltip from '$lib/components/ui/tooltip'; @@ -30,26 +32,29 @@ import { conversations } from '$lib/stores/conversations.svelte'; let { children } = $props(); - let alwaysShowSidebarOnDesktop = $derived(config().alwaysShowSidebarOnDesktop); let isMobile = new IsMobile(); let isDesktop = $derived(!isMobile.current); let sidebarOpen = $state(false); let mounted = $state(false); let innerHeight = $state(); + let chatSidebar: - | { activateSearchMode?: () => void; editActiveConversation?: () => void } + | { + activateSearchMode?: () => void; + editActiveConversation?: () => void; + } | undefined = $state(); let titleUpdateDialogOpen = $state(false); let titleUpdateCurrentTitle = $state(''); 
let titleUpdateNewTitle = $state(''); let titleUpdateResolve: ((value: boolean) => void) | null = null; - const panelNav = useSettingsNavigation(); function navigateToConversation(direction: -1 | 1) { const allConvs = conversations(); + if (allConvs.length === 0) return; const currentId = page.params.id; @@ -61,6 +66,7 @@ } const idx = allConvs.findIndex((c) => c.id === currentId); + if (idx === -1) return; const targetIdx = idx + direction; @@ -75,9 +81,7 @@ // Global keyboard shortcuts const { handleKeydown } = useKeyboardShortcuts({ editActiveConversation: () => chatSidebar?.editActiveConversation?.(), - navigateToPrevConversation: () => navigateToConversation(-1), - navigateToNextConversation: () => navigateToConversation(1) }); @@ -139,6 +143,7 @@ $effect(() => { if (alwaysShowSidebarOnDesktop && isDesktop) { sidebarOpen = true; + return; } }); @@ -175,6 +180,7 @@ // Only fetch router models once when we have models loaded and in router mode if (isRouter && modelsCount > 0 && !routerModelsFetched) { routerModelsFetched = true; + untrack(() => { modelsStore.fetchRouterModels(); }); @@ -223,7 +229,6 @@ -
- - - + {#if !(alwaysShowSidebarOnDesktop && isDesktop) && !(panelNav.isSettingsRoute && !isDesktop)} {#if mounted} @@ -266,9 +271,9 @@ /> {/if} - - {@render children?.()} - + {@render children?.()}
diff --git a/tools/ui/vitest-setup-client.ts b/tools/ui/vitest-setup-client.ts index 0b753db02..90994442e 100644 --- a/tools/ui/vitest-setup-client.ts +++ b/tools/ui/vitest-setup-client.ts @@ -9,70 +9,72 @@ import { beforeEach, vi } from 'vitest'; beforeEach(() => { const originalFetch = globalThis.fetch; - vi.spyOn(globalThis, 'fetch').mockImplementation(async (input: RequestInfo | URL, init?: RequestInit) => { - const url = typeof input === 'string' ? input : input instanceof URL ? input.href : input.url; + vi.spyOn(globalThis, 'fetch').mockImplementation( + async (input: RequestInfo | URL, init?: RequestInit) => { + const url = typeof input === 'string' ? input : input instanceof URL ? input.href : input.url; - // Mock server props endpoint - if (url.includes('/server')) { - return new Response( - JSON.stringify({ - mode: 'router', - version: 'test', - git_commit: 'test', - git_branch: 'test' - }), - { status: 200, headers: { 'Content-Type': 'application/json' } } - ); + // Mock server props endpoint + if (url.includes('/server')) { + return new Response( + JSON.stringify({ + mode: 'router', + version: 'test', + git_commit: 'test', + git_branch: 'test' + }), + { status: 200, headers: { 'Content-Type': 'application/json' } } + ); + } + + // Mock models list endpoint + if (/\/v1\/models|\/models\b/.test(url)) { + return new Response( + JSON.stringify({ + object: 'list', + data: [ + { + id: 'test-model.gguf', + object: 'model', + owned_by: 'llamacpp', + created: 0, + in_cache: false, + path: 'models/test-model.gguf', + status: { value: 'unloaded' }, + meta: {} + } + ], + models: [ + { + model: 'test-model.gguf', + name: 'Test Model', + details: {} + } + ] + }), + { status: 200, headers: { 'Content-Type': 'application/json' } } + ); + } + + // Mock /props endpoint (used for modalities) + if (url.includes('/props')) { + return new Response( + JSON.stringify({ + default_generation_settings: { n_ctx: 2048 } + }), + { status: 200, headers: { 'Content-Type': 
'application/json' } } + ); + } + + // Mock /tools endpoint (used for built-in tools list) + if (url.includes('/tools')) { + return new Response(JSON.stringify([]), { + status: 200, + headers: { 'Content-Type': 'application/json' } + }); + } + + // Default: use real fetch + return originalFetch(input, init); } - - // Mock models list endpoint - if (/\/v1\/models|\/models\b/.test(url)) { - return new Response( - JSON.stringify({ - object: 'list', - data: [ - { - id: 'test-model.gguf', - object: 'model', - owned_by: 'llamacpp', - created: 0, - in_cache: false, - path: 'models/test-model.gguf', - status: { value: 'unloaded' }, - meta: {} - } - ], - models: [ - { - model: 'test-model.gguf', - name: 'Test Model', - details: {} - } - ] - }), - { status: 200, headers: { 'Content-Type': 'application/json' } } - ); - } - - // Mock /props endpoint (used for modalities) - if (url.includes('/props')) { - return new Response( - JSON.stringify({ - default_generation_settings: { n_ctx: 2048 } - }), - { status: 200, headers: { 'Content-Type': 'application/json' } } - ); - } - - // Mock /tools endpoint (used for built-in tools list) - if (url.includes('/tools')) { - return new Response(JSON.stringify([]), { - status: 200, - headers: { 'Content-Type': 'application/json' } - }); - } - - // Default: use real fetch - return originalFetch(input, init); - }); + ); }); diff --git a/tools/ui/vitest.shims.d.ts b/tools/ui/vitest.shims.d.ts new file mode 100644 index 000000000..03b1801a6 --- /dev/null +++ b/tools/ui/vitest.shims.d.ts @@ -0,0 +1 @@ +/// From d14ce3dab4de197adec5166faa54ac5db8262f26 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 May 2026 15:32:58 +0300 Subject: [PATCH 20/33] llama : MTP clean-up (#23269) * llama : disable equal splits for recurrent memory with partial rollback * spec : re-enable p-min with MTP drafts * spec : re-enable ngram spec in combination with RS rollback * spec : fix ngram-map-* params * spec : fix acceptance logic in combined ngram + draft 
configs * graph : fix reuse for combined `token` + `embd` batches * spec : log parameters for each speculative implementation - add LOG_INF in each constructor with implementation type and parameters - extract device string logic into common_speculative_get_devices_str() - move 'adding speculative implementation' log from init into constructors Assisted-by: llama.cpp:local pi * spec : extend --spec-default with ngram-map-k4v Assisted-by: llama.cpp:local pi * minor : fix n_embd log * args : update draft.n_max == 3 + regen docs * spec : relax ngram-mod rejection thold to 0.25 @ 5 low * logs : improve * docs : update speculative decoding CLI argument documentation - Add missing draft model CPU scheduling and tensor override parameters - Update --spec-type to include all available types (excluding draft-eagle3 WIP) - Fix default values to match implementation (n_max=3, n_min=0, p_min=0.0) - Remove deprecated options (spec-draft-ctx-size, spec-draft-replace) - Add environment variables for new parameters Assisted-by: llama.cpp:local pi * arg : step-back on adding k4v to the default spec config * cont : fix name --- common/arg.cpp | 18 +++- common/common.cpp | 23 ----- common/common.h | 8 +- common/ngram-map.cpp | 2 +- common/speculative.cpp | 150 +++++++++++++++++++++++-------- docs/speculative.md | 77 +++++++++++++--- src/llama-graph.h | 3 +- src/llama-memory-hybrid-iswa.cpp | 12 ++- src/llama-memory-hybrid.cpp | 12 ++- src/llama-memory-recurrent.cpp | 12 ++- src/llama-memory-recurrent.h | 1 + src/models/delta-net-base.cpp | 97 ++++++++++++-------- src/models/models.h | 3 - tools/cli/README.md | 4 +- tools/server/README.md | 5 +- 15 files changed, 293 insertions(+), 134 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index ab23b77e0..13dfd4135 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -536,7 +536,11 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context throw std::invalid_argument(string_format("error: invalid argument: 
%s", arg.c_str())); } if (!seen_args.insert(arg).second) { - LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str()); + const bool skip = (arg == "--spec-type"); + + if (!skip) { + LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str()); + } } auto & tmp = arg_to_options[arg]; auto opt = *tmp.first; @@ -893,7 +897,11 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::mapsamplers_seq_config.size(); } - // [TAG_RS_STATE_ROLLBACK_SUPPORT] - // TODO: ngram speculative methods require checkpointing in addition to partial RS rollback - // currently this is not supported. so we disable the partial rollback - if (cparams.n_rs_seq > 0 && (llama_model_is_recurrent(model) || llama_model_is_hybrid(model))) { - auto & types = params.speculative.types; - - for (int i = 0; i < (int) types.size(); i++) { - if (types[i] == COMMON_SPECULATIVE_TYPE_NONE) { - continue; - } - if (types[i] == COMMON_SPECULATIVE_TYPE_DRAFT_MTP) { - continue; - } - - cparams.n_rs_seq = 0; - - LOG_WRN("%s: recurrent state rollback is not compatible with '%s' - disabling rollback support\n", __func__, - common_speculative_type_to_str(types[i]).c_str()); - - break; - } - } - llama_context * lctx = llama_init_from_model(model, cparams); if (lctx == NULL) { LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str()); diff --git a/common/common.h b/common/common.h index e03f70374..53c689bc1 100644 --- a/common/common.h +++ b/common/common.h @@ -299,11 +299,11 @@ struct common_params_model { // draft-model-based speculative decoding parameters struct common_params_speculative_draft { - int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding - int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding + int32_t n_max = 3; // 
maximum number of tokens to draft during speculative decoding + int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding - float p_split = 0.1f; // speculative decoding split probability - float p_min = 0.75f; // minimum speculative decoding probability (greedy) // TODO: change default to 0.0f + float p_split = 0.1f; // speculative decoding split probability + float p_min = 0.0f; // minimum speculative decoding probability (greedy) common_params_model mparams; diff --git a/common/ngram-map.cpp b/common/ngram-map.cpp index 02bc482fe..936415976 100644 --- a/common/ngram-map.cpp +++ b/common/ngram-map.cpp @@ -500,7 +500,7 @@ void common_ngram_map_draft(common_ngram_map & map, draft.push_back(inp[match_pos + n + i]); } - LOG_INF("%s: key_offset = %zu, slot_max = %d, key_num = %d, draft.size = %zu\n", __func__, + LOG_DBG("%s: key_offset = %zu, slot_max = %d, key_num = %d, draft.size = %zu\n", __func__, key_offset, slot_max, curr_key.key_num, draft.size()); diff --git a/common/speculative.cpp b/common/speculative.cpp index e591bab87..4d1b61a13 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -32,6 +32,19 @@ const std::map common_speculative_type_fro {"ngram-cache", COMMON_SPECULATIVE_TYPE_NGRAM_CACHE} }; +static std::string common_speculative_get_devices_str(const std::vector & devices) { + if (devices.empty()) { + return "default"; + } + + std::string result; + for (size_t i = 0; i < devices.size(); i++) { + if (i > 0) result += ", "; + result += ggml_backend_dev_name(devices[i]); + } + return result; +} + struct common_speculative_config { common_speculative_type type; common_params_speculative params; @@ -144,7 +157,7 @@ struct common_speculative_impl { virtual void draft(common_speculative_draft_params_vec & dparams) = 0; - virtual void accept(llama_seq_id seq_id, uint16_t n_accepted) = 0; + virtual void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) = 0; // true if this implementation requires the 
target context to extract post-norm embeddings virtual bool need_embd() const = 0; @@ -167,6 +180,16 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl { auto * ctx_dft = this->params.ctx_dft; auto * ctx_tgt = this->params.ctx_tgt; + LOG_INF("%s: adding speculative implementation 'draft-simple'\n", __func__); + LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min); + LOG_INF("%s: - gpu_layers=%d, cache_k=%s, cache_v=%s, ctx_tgt=%s, ctx_dft=%s, devices=[%s]\n", __func__, + this->params.n_gpu_layers, + ggml_type_name(this->params.cache_type_k), + ggml_type_name(this->params.cache_type_v), + ctx_tgt ? "yes" : "no", + ctx_dft ? "yes" : "no", + common_speculative_get_devices_str(this->params.devices).c_str()); + batch = llama_batch_init(llama_n_batch(ctx_dft), 0, 1); // TODO: optimize or pass from outside? @@ -343,7 +366,7 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl { } } - void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override { + void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override { // noop } @@ -355,8 +378,12 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl { struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { //common_params_speculative_eagle3 params; - common_speculative_impl_draft_eagle3(const common_params_speculative & /*params*/, uint32_t n_seq) - : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, n_seq) {} + common_speculative_impl_draft_eagle3(const common_params_speculative & params, uint32_t n_seq) + : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, n_seq) + { + LOG_INF("%s: adding speculative implementation 'draft-eagle3'\n", __func__); + LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f\n", __func__, params.draft.n_max, params.draft.n_min, params.draft.p_min); + } void begin(llama_seq_id 
/*seq_id*/, const llama_tokens & /*prompt*/) override { // noop @@ -371,7 +398,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { // TODO: implement } - void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override { + void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override { // noop } @@ -380,7 +407,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { } }; -struct common_speculative_state_draft_mtp : public common_speculative_impl { +struct common_speculative_impl_draft_mtp : public common_speculative_impl { common_params_speculative_draft params; // reuses the draft-model params slot (ctx_tgt/ctx_dft) llama_batch batch; @@ -407,7 +434,7 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { // pre-advancement before process() mirrored the verify batch. std::vector last_n_drafted; - common_speculative_state_draft_mtp(const common_params_speculative & params, uint32_t n_seq) + common_speculative_impl_draft_mtp(const common_params_speculative & params, uint32_t n_seq) : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_MTP, n_seq) , params(params.draft) { @@ -417,6 +444,16 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { n_embd = llama_model_n_embd(llama_get_model(ctx_dft)); + LOG_INF("%s: adding speculative implementation 'draft-mtp'\n", __func__); + LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd); + LOG_INF("%s: - gpu_layers=%d, cache_k=%s, cache_v=%s, ctx_tgt=%s, ctx_dft=%s, devices=[%s]\n", __func__, + this->params.n_gpu_layers, + ggml_type_name(this->params.cache_type_k), + ggml_type_name(this->params.cache_type_v), + ctx_tgt ? "yes" : "no", + ctx_dft ? 
"yes" : "no", + common_speculative_get_devices_str(this->params.devices).c_str()); + const int32_t n_b = (int32_t) llama_n_batch(ctx_dft); batch = llama_batch_init(/*n_tokens=*/ n_b, /*embd=*/ n_embd, /*n_seq_max=*/ 1); // llama_batch_init allocates only one of token/embd; MTP needs both. @@ -427,7 +464,7 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { for (auto & s : smpls) { common_params_sampling sparams; sparams.no_perf = false; - sparams.top_k = 1; // TODO: re-enable top_k == 10 and utilize `p_min` spec param + sparams.top_k = 10; sparams.samplers = { COMMON_SAMPLER_TYPE_TOP_K }; s.reset(common_sampler_init(llama_get_model(ctx_dft), sparams)); } @@ -446,7 +483,7 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { last_n_drafted.assign(n_seq, 0); } - ~common_speculative_state_draft_mtp() override { + ~common_speculative_impl_draft_mtp() override { if (batch.token != nullptr) { free(batch.token); batch.token = nullptr; @@ -462,7 +499,7 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { auto * ctx_dft = this->params.ctx_dft; const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id); if (pos_max < N - 1) { - LOG_WRN("%s: ctx_dft pos_max=%d < N-1=%d — " + LOG_WRN("%s: ctx_dft pos_max=%d < N-1=%d - " "process() hook may not have run on every prefill ubatch " "(need_embd / logits=1 on every prompt position?). 
" "Drafts may degrade.\n", @@ -633,6 +670,14 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { // add drafted token for each sequence const llama_token id = cur_p->data[0].id; + // only collect very high-confidence draft tokens + if (cur_p->data[0].p < params.p_min) { + drafting[seq_id] = false; + n_drafting--; + + continue; + } + common_sampler_accept(smpl, id, true); auto & dp = dparams.at(seq_id); @@ -678,7 +723,7 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { } } - void accept(llama_seq_id seq_id, uint16_t n_accepted) override { + void accept(llama_seq_id seq_id, uint16_t n_accepted, bool /*is_other*/) override { if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) { return; } @@ -714,7 +759,12 @@ struct common_speculative_impl_ngram_simple : public common_speculative_impl { common_ngram_simple_config config) : common_speculative_impl(COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, n_seq) , params(params.ngram_simple) - , config(config) {} + , config(config) + { + LOG_INF("%s: adding speculative implementation 'ngram-simple'\n", __func__); + LOG_INF("%s: - size_n=%d, size_m=%d, min_hits=%d\n", __func__, + this->params.size_n, this->params.size_m, this->params.min_hits); + } void begin(llama_seq_id /*seq_id*/, const llama_tokens & /*prompt*/) override { // noop @@ -738,7 +788,7 @@ struct common_speculative_impl_ngram_simple : public common_speculative_impl { } } - void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override { + void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override { // noop } @@ -748,20 +798,21 @@ struct common_speculative_impl_ngram_simple : public common_speculative_impl { }; struct common_speculative_impl_ngram_map_k : public common_speculative_impl { - common_params_speculative_ngram_map params; - // n_seq configs std::vector config; common_speculative_impl_ngram_map_k( - const common_params_speculative & params, const common_ngram_map & config, 
uint32_t n_seq) : common_speculative_impl(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, n_seq) - , params(params.ngram_map_k) { + { for (uint32_t i = 0; i < n_seq; i++) { this->config.push_back(config); } + + LOG_INF("%s: adding speculative implementation '%s'\n", __func__, common_speculative_type_to_str(this->type).c_str()); + LOG_INF("%s: - size_key=%d, size_value=%d, key_only=%d, min_hits=%d\n", __func__, + config.size_key, config.size_value, config.key_only, config.min_hits); } void begin(llama_seq_id seq_id, const llama_tokens & prompt) override { @@ -788,9 +839,13 @@ struct common_speculative_impl_ngram_map_k : public common_speculative_impl { } } - void accept(llama_seq_id seq_id, uint16_t n_accepted) override { + void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) override { GGML_ASSERT((seq_id < (llama_seq_id) config.size())); + if (is_other) { + return; + } + common_ngram_map_accept(config[seq_id], n_accepted); } @@ -812,7 +867,7 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl { // the last position in the prompt that was added to the ngram container size_t i_last = 0; - // length of the last drafted n‑gram (number of tokens returned by draft) + // length of the last drafted n-gram (number of tokens returned by draft) size_t n_draft_last = 0; // consecutive accept rounds with low acceptance fraction (< 0.5) @@ -830,8 +885,11 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl { , verbose(std::getenv("LLAMA_TRACE") != nullptr) { static_assert(sizeof(llama_token) == sizeof(common_ngram_mod::entry_t)); - LOG_INF("%s: initialized ngram_mod with n_match=%d, size=%zu (%.3f MB)\n", __func__, - this->params.n_match, mod.size(), (float)(mod.size_bytes())/1024/1024); + LOG_INF("%s: adding speculative implementation 'ngram-mod'\n", __func__); + LOG_INF("%s: - n_match=%d, n_max=%d, n_min=%d\n", __func__, + this->params.n_match, this->params.n_max, this->params.n_min); + LOG_INF("%s: - mod size=%zu (%.3f 
MB)\n", __func__, + mod.size(), (float)(mod.size_bytes())/1024/1024); if (this->params.n_match < 16) { LOG_WRN("%s: ngram_mod n_match=%d is too small - poor quality is possible, " @@ -921,7 +979,7 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl { } result.resize(result.size() - n); - // store length of drafted n‑gram for later acceptance analysis + // store length of drafted n-gram for later acceptance analysis sinfo.n_draft_last = result.size(); } @@ -943,17 +1001,21 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl { } } - void accept(llama_seq_id seq_id, uint16_t n_accepted) override { + void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) override { + if (is_other) { + return; + } + auto & sinfo = sinfos[seq_id]; // compute acceptance fraction if we have a recorded draft length if (sinfo.n_draft_last > 0) { const double f_acc = (double)n_accepted / (double)sinfo.n_draft_last; - if (f_acc < 0.5) { + if (f_acc < 0.25) { sinfo.n_low++; - if (sinfo.n_low >= 3) { + if (sinfo.n_low >= 5) { if (verbose) { - LOG_WRN("%s: low acceptance streak (%d) – resetting ngram_mod\n", __func__, sinfo.n_low); + LOG_WRN("%s: low acceptance streak (%d) - resetting ngram_mod\n", __func__, sinfo.n_low); } mod.reset(); @@ -1003,6 +1065,12 @@ struct common_speculative_impl_ngram_cache : public common_speculative_impl { , save_dynamic(save_dynamic) , save_static(save_static) { + LOG_INF("%s: adding speculative implementation 'ngram-cache'\n", __func__); + LOG_INF("%s: - n_draft=%d, cache_static=%s, cache_dynamic=%s\n", __func__, + n_draft, + path_static.empty() ? "none" : path_static.c_str(), + path_dynamic.empty() ? 
"none" : path_dynamic.c_str()); + sinfos.resize(n_seq); if (!path_static.empty()) { @@ -1099,7 +1167,7 @@ struct common_speculative_impl_ngram_cache : public common_speculative_impl { } } - void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override { + void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override { // noop } @@ -1285,7 +1353,6 @@ common_speculative * common_speculative_init(common_params_speculative & params, std::vector> impls = {}; for (const common_speculative_config & config : configs) { - LOG_INF("%s: adding speculative implementation '%s'\n", __func__, common_speculative_type_to_str(config.type).c_str()); switch (config.type) { case COMMON_SPECULATIVE_TYPE_NONE: break; @@ -1298,7 +1365,7 @@ common_speculative * common_speculative_init(common_params_speculative & params, break; } case COMMON_SPECULATIVE_TYPE_DRAFT_MTP: { - impls.push_back(std::make_unique(config.params, n_seq)); + impls.push_back(std::make_unique(config.params, n_seq)); break; } case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: { @@ -1319,11 +1386,16 @@ common_speculative * common_speculative_init(common_params_speculative & params, impls.push_back(std::move(state)); break; } - case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: + case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: { + impls.push_back( + std::make_unique( + get_common_ngram_map(config.type, config.params.ngram_map_k), n_seq)); + break; + } case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: { impls.push_back( std::make_unique( - config.params, get_common_ngram_map(config.type, config.params.ngram_map_k), n_seq)); + get_common_ngram_map(config.type, config.params.ngram_map_k4v), n_seq)); break; } case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: { @@ -1515,11 +1587,6 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u GGML_ASSERT(impl); - // TODO: currently only the implementation that generated the draft is used to accept it - // however, some implementations (such as MTP) need to 
also "see" the accepted tokens - // extend `common_speculative_impl::accept()` with an extra argument `bool is_other` to - // inform the implementation if the accepted tokens are from another implementation and - // pass the accepted tokens to all remaining implementations using `is_other == true` { common_time_meas tm(impl->t_accept_us, !impl->gen_perf); if (n_accepted > 0) { @@ -1527,9 +1594,16 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u impl->n_acc_tokens += n_accepted; } - impl->accept(seq_id, n_accepted); + impl->accept(seq_id, n_accepted, false); impl->n_call_accept++; } + + // accept with the rest of the implementations, using is_other == true + for (auto & impl_other : spec->impls) { + if (impl_other.get() != impl) { + impl_other->accept(seq_id, n_accepted, true); + } + } } void common_speculative_print_stats(const common_speculative * spec) { @@ -1549,7 +1623,7 @@ void common_speculative_print_stats(const common_speculative * spec) { str_perf = ""; } - LOG_INF("statistics %s: #calls(b,g,a) = %zu %zu %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n", + LOG_INF("statistics %16s: #calls(b,g,a) = %4zu %6zu %6zu, #gen drafts = %6zu, #acc drafts = %5zu, #gen tokens = %6zu, #acc tokens = %5zu%s\n", common_speculative_type_to_str(impl->type).c_str(), impl->n_call_begin, impl->n_call_draft, impl->n_call_accept, impl->n_gen_drafts, diff --git a/docs/speculative.md b/docs/speculative.md index fb6ef0306..45e42d42a 100644 --- a/docs/speculative.md +++ b/docs/speculative.md @@ -108,11 +108,12 @@ If a draft model is combined with a draftless decoding the draftless decoding ha ### General Speculative Parameters ``` ---spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod] - type of speculative decoding to use when no draft model is provided +--spec-type [none|draft-simple|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod] + comma-separated list of types 
of speculative decoding to use (default: none) (env: LLAMA_ARG_SPEC_TYPE) ---spec-default use default speculative decoding +--spec-default use default speculative decoding config + (enables ngram-mod) ``` ### Draft Model Parameters @@ -123,8 +124,9 @@ If a draft model is combined with a draftless decoding the draftless decoding ha (env: LLAMA_ARG_SPEC_DRAFT_MODEL) --spec-draft-hf, -hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant] HuggingFace repository for the draft model + (env: LLAMA_ARG_SPEC_DRAFT_HF_REPO) --spec-draft-n-max N - number of tokens to draft for speculative decoding (default: 16) + number of tokens to draft for speculative decoding (default: 3) (env: LLAMA_ARG_SPEC_DRAFT_N_MAX) --spec-draft-n-min N minimum number of draft tokens to use for speculative decoding (default: 0) @@ -133,18 +135,64 @@ If a draft model is combined with a draftless decoding the draftless decoding ha speculative decoding split probability (default: 0.10) (env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT) --spec-draft-p-min, --draft-p-min P - minimum speculative decoding probability (greedy) (default: 0.75) + minimum speculative decoding probability (greedy) (default: 0.00) (env: LLAMA_ARG_SPEC_DRAFT_P_MIN) ---spec-draft-ctx-size, -cd, --ctx-size-draft N - size of the prompt context for the draft model (default: 0, 0 = loaded from model) - (env: LLAMA_ARG_SPEC_DRAFT_CTX_SIZE) --spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N max. 
number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto) (env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) --spec-draft-device, -devd, --device-draft comma-separated list of devices to use for offloading the draft model ---spec-draft-replace, --spec-replace TARGET DRAFT - translate the string in TARGET into DRAFT if the draft model and main model are not compatible + (use --list-devices to see available devices) +``` + +### Draft Model CPU Scheduling Parameters + +``` +--spec-draft-threads, -td, --threads-draft N + number of CPU threads to use during generation +--spec-draft-threads-batch, -tbd, --threads-batch-draft N + number of threads to use during batch and prompt processing (default: same as --threads-draft) +--spec-draft-cpu-mask, -Cd, --cpu-mask-draft M + Draft model CPU affinity mask. Complements cpu-range-draft +--spec-draft-cpu-range, -Crd, --cpu-range-draft lo-hi + Ranges of CPUs for affinity. Complements --cpu-mask-draft +--spec-draft-cpu-strict, --cpu-strict-draft <0|1> + Use strict CPU placement for draft model (default: same as --cpu-strict) +--spec-draft-prio, --prio-draft N + set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime +--spec-draft-poll, --poll-draft <0|1> + Use polling to wait for draft model work (default: same as --poll) +--spec-draft-cpu-mask-batch, -Cbd, --cpu-mask-batch-draft M + Draft model CPU affinity mask for batch. Complements cpu-range-batch-draft +--spec-draft-cpu-range-batch, -Crbd, --cpu-range-batch-draft lo-hi + Ranges of CPUs for affinity for batch. 
Complements --cpu-mask-batch-draft +--spec-draft-cpu-strict-batch, --cpu-strict-batch-draft <0|1> + Use strict CPU placement for draft model batch (default: --cpu-strict-draft) +--spec-draft-prio-batch, --prio-batch-draft N + set draft process/thread priority for batch : 0-normal, 1-medium, 2-high, 3-realtime +--spec-draft-poll-batch, --poll-batch-draft <0|1> + Use polling to wait for draft model work for batch (default: --poll-draft) +``` + +### Draft Model KV Cache and Tensor Override Parameters + +``` +--spec-draft-type-k, -ctkd, --cache-type-k-draft TYPE + KV cache data type for K for the draft model + allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1 + (env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_K) +--spec-draft-type-v, -ctvd, --cache-type-v-draft TYPE + KV cache data type for V for the draft model + allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1 + (env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_V) +--spec-draft-override-tensor, -otd, --override-tensor-draft =,... + override tensor buffer type for draft model +--spec-draft-cpu-moe, -cmoed, --cpu-moe-draft + keep all Mixture of Experts (MoE) weights in the CPU for the draft model + (env: LLAMA_ARG_SPEC_DRAFT_CPU_MOE) +--spec-draft-n-cpu-moe, --spec-draft-ncmoe, -ncmoed, --n-cpu-moe-draft N + keep the MoE weights of the first N layers in the CPU for the draft model + (env: LLAMA_ARG_SPEC_DRAFT_N_CPU_MOE) ``` ### n-gram Mod Parameters @@ -193,11 +241,13 @@ If a draft model is combined with a draftless decoding the draftless decoding ha ### `--spec-type TYPE` -Specifies a type of speculative decoding without draft model. +Specifies a comma-separated list of speculative decoding types to use. 
| Type | Description | |------|-------------| | `none` | No speculative decoding (default) | +| `draft-simple` | Use a simple draft model for speculation | +| `draft-mtp` | Use Multi-Token Prediction (MTP) heads from the main model | | `ngram-cache` | Use n-gram cache lookup | | `ngram-simple` | Use simple n-gram pattern matching | | `ngram-map-k` | Use n-gram pattern matching with n-gram-keys | @@ -209,6 +259,11 @@ Specifies a type of speculative decoding without draft model. ./llama-server [...] --spec-type ngram-simple ``` +**Example:** Multiple speculative implementations. +```bash +./llama-server [...] --spec-type ngram-mod,ngram-map-k4v +``` + ### `--spec-ngram-*-size-n N` Sets the size N of the lookup n-gram for n-gram map based speculative decoding. diff --git a/src/llama-graph.h b/src/llama-graph.h index 9e55d0a67..bf6778237 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -581,7 +581,8 @@ struct llm_graph_params { ubatch.n_seqs_unq == other.ubatch.n_seqs_unq && ( (!ubatch.token && !other.ubatch.token) || - (!ubatch.embd && !other.ubatch.embd) + (!ubatch.embd && !other.ubatch.embd) || + (ubatch.token && other.ubatch.token && ubatch.embd && other.ubatch.embd) ); // when we split the batch using "equal_seqs" we have to verify that the participating sequences are the same diff --git a/src/llama-memory-hybrid-iswa.cpp b/src/llama-memory-hybrid-iswa.cpp index a59561ea5..72f5c2fea 100644 --- a/src/llama-memory-hybrid-iswa.cpp +++ b/src/llama-memory-hybrid-iswa.cpp @@ -75,9 +75,15 @@ llama_memory_context_ptr llama_memory_hybrid_iswa::init_batch(llama_batch_allocr // if all tokens are output, split by sequence ubatch = balloc.split_seq(n_ubatch); } else { - // Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice) - const bool unified = (mem_attn->get_base()->get_n_stream() == 1); - ubatch = balloc.split_equal(n_ubatch, !unified); + if (mem_recr->n_rs_seq > 0) { + // [TAG_RECURRENT_ROLLBACK_SPLITS] + // TODO: 
recurrent state rollback does not support equal splits + ubatch = balloc.split_seq(n_ubatch); + } else { + // Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice) + const bool unified = (mem_attn->get_base()->get_n_stream() == 1); + ubatch = balloc.split_equal(n_ubatch, !unified); + } } if (ubatch.n_tokens == 0) { diff --git a/src/llama-memory-hybrid.cpp b/src/llama-memory-hybrid.cpp index fd305cab7..33b3b395e 100644 --- a/src/llama-memory-hybrid.cpp +++ b/src/llama-memory-hybrid.cpp @@ -75,9 +75,15 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba // if all tokens are output, split by sequence ubatch = balloc.split_seq(n_ubatch); } else { - // Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice) - const bool unified = (mem_attn->get_n_stream() == 1); - ubatch = balloc.split_equal(n_ubatch, !unified); + if (mem_recr->n_rs_seq > 0) { + // [TAG_RECURRENT_ROLLBACK_SPLITS] + // TODO: recurrent state rollback does not support equal splits + ubatch = balloc.split_seq(n_ubatch); + } else { + // Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice) + const bool unified = (mem_attn->get_n_stream() == 1); + ubatch = balloc.split_equal(n_ubatch, !unified); + } } if (ubatch.n_tokens == 0) { diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp index aeb866657..ec5dc5835 100644 --- a/src/llama-memory-recurrent.cpp +++ b/src/llama-memory-recurrent.cpp @@ -416,9 +416,15 @@ llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & // if all tokens are output, split by sequence ubatch = balloc.split_seq(n_ubatch); } else { - // TODO: non-sequential equal split can be done if using unified KV cache - // for simplicity, we always use sequential equal split for now - ubatch = balloc.split_equal(n_ubatch, true); + if (n_rs_seq > 0) { + // [TAG_RECURRENT_ROLLBACK_SPLITS] 
+ // TODO: recurrent state rollback does not support equal splits + ubatch = balloc.split_seq(n_ubatch); + } else { + // TODO: non-sequential equal split can be done if using unified KV cache + // for simplicity, we always use sequential equal split for now + ubatch = balloc.split_equal(n_ubatch, true); + } } if (ubatch.n_tokens == 0) { diff --git a/src/llama-memory-recurrent.h b/src/llama-memory-recurrent.h index 29c58afc9..b13b7b748 100644 --- a/src/llama-memory-recurrent.h +++ b/src/llama-memory-recurrent.h @@ -72,6 +72,7 @@ public: // number of recurrent-state snapshots per seq for rollback; tensors are widened to (1 + n_rs_seq) groups uint32_t n_rs_seq = 0; + // per-seq rollback index std::vector rs_idx; diff --git a/src/models/delta-net-base.cpp b/src/models/delta-net-base.cpp index 2a4e00384..a67238383 100644 --- a/src/models/delta-net-base.cpp +++ b/src/models/delta-net-base.cpp @@ -447,13 +447,6 @@ std::pair llm_build_delta_net_base::build_delta_ne return build_delta_net_chunking(q, k, v, g, b, s, il); } -bool llm_build_delta_net_base::keep_rs() const { - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - return cparams.n_rs_seq > 0 - && n_seq_tokens > 1 - && (uint32_t) n_seq_tokens <= 1 + cparams.n_rs_seq; -} - ggml_tensor * llm_build_delta_net_base::build_conv_state( llm_graph_input_rs * inp, ggml_tensor * conv_states_all, @@ -461,12 +454,12 @@ ggml_tensor * llm_build_delta_net_base::build_conv_state( int64_t conv_kernel_size, int64_t conv_channels, int il) { - const auto * mctx_cur = inp->mctx; - const auto kv_head = mctx_cur->get_head(); - const uint32_t mem_size = mctx_cur->get_size(); - const int64_t n_seqs = ubatch.n_seqs; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const bool keep = keep_rs(); + const auto * mctx_cur = inp->mctx; + + const auto kv_head = mctx_cur->get_head(); + const auto mem_size = mctx_cur->get_size(); + + const int64_t n_seqs = ubatch.n_seqs; ggml_tensor * conv_states = build_rs(inp, conv_states_all, 
hparams.n_embd_r(), n_seqs); cb(conv_states, "conv_states", il); @@ -480,32 +473,52 @@ ggml_tensor * llm_build_delta_net_base::build_conv_state( ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0); cb(conv_input, "conv_input", il); - if (!keep) { - ggml_tensor * last_conv_states = - ggml_view_3d(ctx0, conv_input, conv_kernel_size - 1, conv_channels, n_seqs, conv_input->nb[1], - conv_input->nb[2], (conv_input->ne[0] - conv_states->ne[0]) * ggml_element_size(conv_input)); - cb(last_conv_states, "last_conv_states", il); + const int64_t row_count = (conv_kernel_size - 1) * conv_channels; - ggml_tensor * state_update_target = - ggml_view_2d(ctx0, conv_states_all, (conv_kernel_size - 1) * conv_channels, n_seqs, conv_states_all->nb[1], - kv_head * (conv_kernel_size - 1) * conv_channels * ggml_element_size(conv_states_all)); - cb(state_update_target, "state_update_target", il); + const size_t row_size = ggml_row_size(conv_states_all->type, row_count); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target)); + if (cparams.n_rs_seq == 0) { + const int64_t s_idx = conv_input->ne[0] - conv_states->ne[0]; + const int64_t s_slot = 0; + + ggml_tensor * conv_state_last = + ggml_view_3d(ctx0, conv_input, + conv_kernel_size - 1, conv_channels, n_seqs, + conv_input->nb[1], conv_input->nb[2], + ggml_row_size(conv_input->type, s_idx)); + cb(conv_state_last, "conv_state_last", il); + + ggml_tensor * conv_state_update = + ggml_view_2d(ctx0, conv_states_all, + row_count, n_seqs, conv_states_all->nb[1], + (s_slot * mem_size + kv_head) * row_size); + cb(conv_state_update, "conv_state_update", il); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, conv_state_last, conv_state_update)); } else { - const int64_t row_count = (conv_kernel_size - 1) * conv_channels; - const size_t row_size = row_count * ggml_element_size(conv_states_all); - for (int64_t t = 1; t <= n_seq_tokens; ++t) { - const uint32_t slot = (uint32_t)(n_seq_tokens - t); - 
ggml_tensor * src = - ggml_view_3d(ctx0, conv_input, conv_kernel_size - 1, conv_channels, n_seqs, - conv_input->nb[1], conv_input->nb[2], - t * ggml_element_size(conv_input)); - ggml_tensor * dst = - ggml_view_2d(ctx0, conv_states_all, row_count, n_seqs, - conv_states_all->nb[1], - ((size_t) slot * mem_size + kv_head) * row_size); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, src, dst)); + // [TAG_RECURRENT_ROLLBACK_SPLITS] + // TODO: this logic incorrectly assumes that the last (n_rs_seq + 1) tokens of a sequence in a batch are + // inside the same ubatch. currently with `split_equal()` this is not correct + + const int64_t K = (int64_t) cparams.n_rs_seq + 1; + + for (int64_t t = 1; t <= K; ++t) { + const int64_t s_idx = std::max(0, conv_input->ne[0] - conv_states->ne[0] - K + t); + const int64_t s_slot = K - t; + + ggml_tensor * conv_state_last = + ggml_view_3d(ctx0, conv_input, + conv_kernel_size - 1, conv_channels, n_seqs, + conv_input->nb[1], conv_input->nb[2], + ggml_row_size(conv_input->type, s_idx)); + + ggml_tensor * conv_state_update = + ggml_view_2d(ctx0, + conv_states_all, row_count, n_seqs, + conv_states_all->nb[1], + (s_slot * mem_size + kv_head) * row_size); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, conv_state_last, conv_state_update)); } } @@ -531,7 +544,9 @@ ggml_tensor * llm_build_delta_net_base::build_recurrent_attn( const int64_t n_seqs = s->ne[3]; const int64_t n_seq_tokens = q->ne[2]; - if (!keep_rs()) { + const bool keep = cparams.n_rs_seq > 0; + + if (!keep) { auto attn_out = build_delta_net(q, k, v, g, b, s, il); ggml_tensor * output = attn_out.first; ggml_tensor * new_state = attn_out.second; @@ -554,7 +569,11 @@ ggml_tensor * llm_build_delta_net_base::build_recurrent_attn( ggml_tensor * state_3d = ggml_pad(ctx0, state_in_3d, 0, K - 1, 0, 0); ggml_tensor * gdn_out = ggml_gated_delta_net(ctx0, q, k, v, g, b, state_3d); - cb(gdn_out, LLAMA_TENSOR_NAME_FGDN_CH, il); + if (n_seq_tokens > 1) { + cb(gdn_out, LLAMA_TENSOR_NAME_FGDN_CH, 
il); + } else { + cb(gdn_out, LLAMA_TENSOR_NAME_FGDN_AR, il); + } const int64_t attn_score_elems = S_v * H_v * n_seq_tokens * n_seqs; const int64_t state_size_per_snap = S_v * S_v * H_v * n_seqs; @@ -576,9 +595,11 @@ ggml_tensor * llm_build_delta_net_base::build_recurrent_attn( ggml_row_size(gdn_out->type, S_v * S_v), ggml_row_size(gdn_out->type, S_v * S_v * H_v), ggml_row_size(gdn_out->type, attn_score_elems + k_i * state_size_per_snap)); + ggml_tensor * dst = ggml_view_2d(ctx0, ssm_states_all, hparams.n_embd_s(), n_seqs, ssm_states_all->nb[1], ((size_t) cache_slot * mem_size + kv_head) * row_size); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, src, dst)); } diff --git a/src/models/models.h b/src/models/models.h index 4e40536a5..7e551eb96 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -66,9 +66,6 @@ struct llm_build_delta_net_base : public llm_graph_context { ggml_tensor * s, int il); - // true when speculative rollback is enabled and the batch fits in the rs cache - bool keep_rs() const; - // read conv state from cache, concat with qkv_mixed, write back (single slot or per-token) // qkv_mixed: (qkv_dim, n_seq_tokens, n_seqs); returns conv_input: (kernel_size + n_seq_tokens - 1, channels, n_seqs) ggml_tensor * build_conv_state( diff --git a/tools/cli/README.md b/tools/cli/README.md index c40b5a21c..38bc78a3f 100644 --- a/tools/cli/README.md +++ b/tools/cli/README.md @@ -191,10 +191,10 @@ | `--spec-draft-override-tensor, -otd, --override-tensor-draft =,...` | override tensor buffer type for draft model | | `--spec-draft-cpu-moe, -cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model
(env: LLAMA_ARG_SPEC_DRAFT_CPU_MOE) | | `--spec-draft-n-cpu-moe, --spec-draft-ncmoe, -ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model
(env: LLAMA_ARG_SPEC_DRAFT_N_CPU_MOE) | -| `--spec-draft-n-max N` | number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_SPEC_DRAFT_N_MAX) | +| `--spec-draft-n-max N` | number of tokens to draft for speculative decoding (default: 3)
(env: LLAMA_ARG_SPEC_DRAFT_N_MAX) | | `--spec-draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)
(env: LLAMA_ARG_SPEC_DRAFT_N_MIN) | | `--spec-draft-p-split, --draft-p-split P` | speculative decoding split probability (default: 0.10)
(env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT) | -| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.75)
(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) | +| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.00)
(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) | | `--spec-draft-device, -devd, --device-draft ` | comma-separated list of devices to use for offloading the draft model (none = don't offload)
use --list-devices to see a list of available devices | | `--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) | | `--spec-draft-model, -md, --model-draft FNAME` | draft model for speculative decoding (default: unused)
(env: LLAMA_ARG_SPEC_DRAFT_MODEL) | diff --git a/tools/server/README.md b/tools/server/README.md index 11098af28..9b4134239 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -183,6 +183,7 @@ For the full list of features, please refer to [server's changelog](https://gith | `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)
(env: LLAMA_ARG_IMAGE_MAX_TOKENS) | | `-a, --alias STRING` | set model name aliases, comma-separated (to be used by API)
(env: LLAMA_ARG_ALIAS) | | `--tags STRING` | set model tags, comma-separated (informational, not used for routing)
(env: LLAMA_ARG_TAGS) | +| `--embd-normalize N` | normalisation for embeddings (default: 2) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm) | | `--host HOST` | ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: 127.0.0.1)
(env: LLAMA_ARG_HOST) | | `--port PORT` | port to listen (default: 8080)
(env: LLAMA_ARG_PORT) | | `--reuse-port` | allow multiple sockets to bind to the same port (default: disabled)
(env: LLAMA_ARG_REUSE_PORT) | @@ -244,10 +245,10 @@ For the full list of features, please refer to [server's changelog](https://gith | `--spec-draft-override-tensor, -otd, --override-tensor-draft =,...` | override tensor buffer type for draft model | | `--spec-draft-cpu-moe, -cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model
(env: LLAMA_ARG_SPEC_DRAFT_CPU_MOE) | | `--spec-draft-n-cpu-moe, --spec-draft-ncmoe, -ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model
(env: LLAMA_ARG_SPEC_DRAFT_N_CPU_MOE) | -| `--spec-draft-n-max N` | number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_SPEC_DRAFT_N_MAX) | +| `--spec-draft-n-max N` | number of tokens to draft for speculative decoding (default: 3)
(env: LLAMA_ARG_SPEC_DRAFT_N_MAX) | | `--spec-draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)
(env: LLAMA_ARG_SPEC_DRAFT_N_MIN) | | `--spec-draft-p-split, --draft-p-split P` | speculative decoding split probability (default: 0.10)
(env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT) | -| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.75)
(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) | +| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.00)
(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) | | `--spec-draft-device, -devd, --device-draft ` | comma-separated list of devices to use for offloading the draft model (none = don't offload)
use --list-devices to see a list of available devices | | `--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) | | `--spec-draft-model, -md, --model-draft FNAME` | draft model for speculative decoding (default: unused)
(env: LLAMA_ARG_SPEC_DRAFT_MODEL) | From baf3cc6e1d70ce73f66a0665811e5c2228cddc5d Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 19 May 2026 18:41:44 +0200 Subject: [PATCH 21/33] model : clarify MTP layer comment in qwen35.cpp [no ci] (#23338) This commit attempts to clarify a code comment in graph_mtp regarding where the MTP layer is stored. The motivation for this is that it was not obvious to me what the original comment meant and hopefully this makes it clearer. --- src/models/qwen35.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index 361d7538a..35a0158e8 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -496,7 +496,8 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr const int64_t n_embd_head = hparams.n_embd_head_v(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); - // The MTP block lives at the source file's original layer index. + // hparams.n_layer includes both main model layers and MTP layers. The MTP + // layer is stored immediately after the main layers in model.layers[]. 
const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers; const auto & layer = model.layers[il]; From ac76808e4db7bbb4082b86e7fbd615934b44ac6e Mon Sep 17 00:00:00 2001 From: Aparna M P Date: Tue, 19 May 2026 22:18:21 +0530 Subject: [PATCH 22/33] hexagon: enable support for NORM op (#23319) --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 5 +- ggml/src/ggml-hexagon/htp/htp-ops.h | 1 + ggml/src/ggml-hexagon/htp/main.c | 1 + ggml/src/ggml-hexagon/htp/unary-ops.c | 97 ++++++++++++++++++++++++++ 4 files changed, 101 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 2f75e97ac..ebeef3bdb 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -2870,6 +2870,7 @@ static htp_op_code op_remap_to_htp(const ggml_tensor * t) { case GGML_OP_SET_ROWS: return HTP_OP_SET_ROWS; case GGML_OP_SUM_ROWS: return HTP_OP_SUM_ROWS; case GGML_OP_ARGSORT: return HTP_OP_ARGSORT; + case GGML_OP_NORM: return HTP_OP_NORM; case GGML_OP_L2_NORM: return HTP_OP_L2_NORM; case GGML_OP_RMS_NORM: return HTP_OP_RMS_NORM; case GGML_OP_SCALE: return HTP_OP_SCALE; @@ -3338,10 +3339,8 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons supp = ggml_hexagon_supported_add_id(sess, op); break; + case GGML_OP_NORM: case GGML_OP_L2_NORM: - supp = ggml_hexagon_supported_unary(sess, op); - break; - case GGML_OP_RMS_NORM: case GGML_OP_SCALE: supp = ggml_hexagon_supported_unary(sess, op); diff --git a/ggml/src/ggml-hexagon/htp/htp-ops.h b/ggml/src/ggml-hexagon/htp/htp-ops.h index 676e948a4..9d905a301 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ops.h +++ b/ggml/src/ggml-hexagon/htp/htp-ops.h @@ -88,6 +88,7 @@ enum htp_op_code { HTP_OP_GATED_DELTA_NET, HTP_OP_TRI, HTP_OP_PAD, + HTP_OP_NORM, HTP_OP_INVALID }; diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c index 12003c1fd..8e54536f6 100644 --- 
a/ggml/src/ggml-hexagon/htp/main.c +++ b/ggml/src/ggml-hexagon/htp/main.c @@ -534,6 +534,7 @@ static int execute_op(struct htp_ops_context * octx) { case HTP_OP_ADD_ID: return op_binary(octx); + case HTP_OP_NORM: case HTP_OP_RMS_NORM: case HTP_OP_SCALE: case HTP_OP_SQR: diff --git a/ggml/src/ggml-hexagon/htp/unary-ops.c b/ggml/src/ggml-hexagon/htp/unary-ops.c index 1ce881353..40d2d6015 100644 --- a/ggml/src/ggml-hexagon/htp/unary-ops.c +++ b/ggml/src/ggml-hexagon/htp/unary-ops.c @@ -158,6 +158,79 @@ static void hvx_fast_rms_norm_f32(const uint8_t * restrict src, } } +static void hvx_fast_norm_f32(const uint8_t * restrict src, + uint8_t * restrict dst, + uint8_t * restrict pad, + const int num_elems, + float epsilon) { + (void)pad; + + const HVX_Vector * restrict v_src = (HVX_Vector *) src; + HVX_Vector * restrict v_dst = (HVX_Vector *) dst; + + const int nvec = num_elems / VLEN_FP32; // number of full vectors + const int nloe = num_elems % VLEN_FP32; // leftover elements + + // Compute sum of squares and sum of values for full vectors + HVX_Vector sum_sq_v = Q6_V_vsplat_R(0x00000000); + HVX_Vector sum_x_v = Q6_V_vsplat_R(0x00000000); + HVX_Vector epsilon_v = hvx_vec_splat_f32(epsilon); + + #pragma unroll(4) + for (int i = 0; i < nvec; i++) { + HVX_Vector v1 = v_src[i]; + HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, v1); + sum_sq_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_sq_v, v2); + sum_x_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_x_v, Q6_Vqf32_vadd_VsfVsf(v1, Q6_V_vzero())); + } + + // Handle tail elements using vectorized ops with masking + if (nloe > 0) { + HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4); + HVX_Vector v1 = Q6_V_vand_QV(bmask, v_src[nvec]); + HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, v1); + sum_sq_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_sq_v, v2); + sum_x_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_x_v, Q6_Vqf32_vadd_VsfVsf(v1, Q6_V_vzero())); + } + + // Reduce HVX sums + sum_sq_v = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_sq_v)); + sum_x_v = 
hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_x_v)); + + HVX_Vector t_v = hvx_vec_splat_f32((float) num_elems); + HVX_Vector denom_v = hvx_vec_inverse_f32(t_v); + HVX_Vector mean_sq_v = Q6_Vqf32_vmpy_VsfVsf(sum_sq_v, denom_v); + HVX_Vector mean_x_v = Q6_Vqf32_vmpy_VsfVsf(sum_x_v, denom_v); + HVX_Vector mean_x_sq_v = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(mean_x_v), Q6_Vsf_equals_Vqf32(mean_x_v)); + HVX_Vector var_v = Q6_Vqf32_vsub_Vqf32Vqf32(mean_sq_v, mean_x_sq_v); + HVX_Vector var_epsilon_v = Q6_Vqf32_vadd_Vqf32Vsf(var_v, epsilon_v); + + // scale = rsqrt(variance + epsilon), mean_x broadcast for subtraction + HVX_Vector scale_v = hvx_vec_rsqrt_f32(Q6_Vsf_equals_Vqf32(var_epsilon_v)); + HVX_Vector mean_x_b = hvx_vec_splat_f32(hvx_vec_get_f32(Q6_Vsf_equals_Vqf32(mean_x_v))); + + #pragma unroll(4) + for (int i = 0; i < nvec; i++) { + HVX_Vector v1 = v_src[i]; + HVX_Vector v2 = Q6_Vqf32_vsub_VsfVsf(v1, mean_x_b); + HVX_Vector v3 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v2), scale_v); + v_dst[i] = Q6_Vsf_equals_Vqf32(v3); + } + + // Handle tail elements using vectorized ops with masking + if (nloe > 0) { + + HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4); + HVX_Vector v1 = Q6_V_vand_QV(bmask, v_src[nvec]); + HVX_Vector v2 = Q6_Vqf32_vsub_VsfVsf(v1, mean_x_b); + HVX_Vector v3 = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(v2), scale_v); + HVX_Vector result = Q6_Vsf_equals_Vqf32(v3); + + // Store with masking to avoid overwriting memory beyond the tensor + hvx_vec_store_a(&v_dst[nvec], nloe * 4, result); + } +} + static void scale_f32(const float * restrict src, float * restrict dst, uint8_t * restrict spad, @@ -196,6 +269,24 @@ static void rms_norm_f32(const float * restrict src, } } +static void norm_f32(const float * restrict src, + float * restrict dst, + uint8_t * restrict spad, + const uint32_t num_rows, + const uint32_t row_elems, + const size_t row_size, + int32_t * op_params) { + float epsilon = 0.f; + memcpy(&epsilon, op_params, sizeof(float)); + + for 
(uint32_t ir = 0; ir < num_rows; ir++) { + const uint8_t * restrict src_local = (const uint8_t *)src + (ir * row_size); + uint8_t * restrict dst_local = (uint8_t *)dst + (ir * row_size); + + hvx_fast_norm_f32((const uint8_t *) src_local, (uint8_t *) dst_local, spad, row_elems, epsilon); + } +} + static void sqr_f32(const float * restrict src, float * restrict dst, uint8_t * restrict spad, @@ -556,6 +647,9 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * // Process block in VTCM switch (htp_op) { + case HTP_OP_NORM: + norm_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params); + break; case HTP_OP_RMS_NORM: rms_norm_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params); break; @@ -632,6 +726,9 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) { const char * op_type = NULL; switch (octx->op) { + case HTP_OP_NORM: + op_type = "norm-f32"; + break; case HTP_OP_RMS_NORM: op_type = "rmsnorm-f32"; break; From b7393a4d198011190ecf6420893febb997b84520 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Tue, 19 May 2026 21:16:58 +0200 Subject: [PATCH 23/33] convert : update mtp related help (#23334) * update mtp related help * remove outdated experimental text --- convert_hf_to_gguf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ff8400508..1d18a1bf9 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -115,15 +115,15 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument( "--mmproj", action="store_true", - help="(Experimental) Export multimodal projector (mmproj) for vision models. This will only work on some vision models. A prefix 'mmproj-' will be added to the output file name.", + help="Export multimodal projector (mmproj) for vision models. This will only work on some vision models. 
An 'mmproj-' prefix will be added to the output file name.", ) parser.add_argument( "--mtp", action="store_true", - help="(Experimental) Export only the multi-token prediction (MTP) head as a separate GGUF, suitable for use as a speculative draft. Output file name will get a '-MTP' suffix.", + help="Export only the multi-token prediction (MTP) head as a separate GGUF, suitable for use as a speculative draft. An 'mtp-' prefix will be added to the output file name.", ) parser.add_argument( "--no-mtp", action="store_true", - help="(Experimental) Exclude the multi-token prediction (MTP) head from the converted GGUF. Pair with --mtp on a second run to publish trunk and MTP as two files. Note: the split form duplicates embeddings, so the bundled default is more space-efficient overall.", + help="Exclude the multi-token prediction (MTP) head from the converted GGUF. Pair with --mtp on a second run to publish trunk and MTP as two files. Note: the split form duplicates embeddings, but even though the bundled default is more space-efficient overall, this allows differing quantization which may be more performant.", ) parser.add_argument( "--mistral-format", action="store_true", From 7256fce047b4fda9a4d82e659e3ff7ebb11e3bbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Tue, 19 May 2026 21:33:23 +0200 Subject: [PATCH 24/33] common: fix --fit verbosity with --verbosity 4 (#23282) --- common/common.cpp | 4 ++-- tools/fit-params/fit-params.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index b6fdec3ce..d77ddeda1 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1173,7 +1173,7 @@ common_init_result::common_init_result(common_params & params, bool model_only) params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx, - params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR); + params.verbosity >= LOG_LEVEL_DEBUG ? 
GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR); } llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams); @@ -1366,7 +1366,7 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode } if (params.warmup) { - LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__); + LOG_INF("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__); llama_set_warmup(lctx, true); diff --git a/tools/fit-params/fit-params.cpp b/tools/fit-params/fit-params.cpp index bcdf44040..20a5ff1eb 100644 --- a/tools/fit-params/fit-params.cpp +++ b/tools/fit-params/fit-params.cpp @@ -30,7 +30,7 @@ int main(int argc, char ** argv) { if (!params.fit_params_print) { const common_params_fit_status status = common_fit_params(params.model.path.c_str(), &mparams, &cparams, params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx, - params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR); + params.verbosity >= LOG_LEVEL_DEBUG ? 
GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR); if (status != COMMON_PARAMS_FIT_STATUS_SUCCESS) { LOG_ERR("%s: failed to fit CLI arguments to free memory, exiting...\n", __func__); exit(1); From 57cb35c8867fce383140d9b34a811920daed8c46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Tue, 19 May 2026 21:34:04 +0200 Subject: [PATCH 25/33] common: fix --help for --verbosity (#23278) --- common/arg.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/arg.cpp b/common/arg.cpp index 13dfd4135..87462f49e 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3364,7 +3364,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex " - 1: error\n" " - 2: warning\n" " - 3: info\n" - " - 4: debug\n" + " - 4: trace (more info)\n" + " - 5: debug\n" "(default: %d)\n", params.verbosity), [](common_params & params, int value) { params.verbosity = value; From a8078675a6a06584954a74beb654ae1b4b80041d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Tue, 19 May 2026 21:35:10 +0200 Subject: [PATCH 26/33] github: mention --log-file in issue templates (#23277) --- .github/ISSUE_TEMPLATE/011-bug-results.yml | 4 ++-- .github/ISSUE_TEMPLATE/019-bug-misc.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/011-bug-results.yml b/.github/ISSUE_TEMPLATE/011-bug-results.yml index c7001edf0..23150d0b6 100644 --- a/.github/ISSUE_TEMPLATE/011-bug-results.yml +++ b/.github/ISSUE_TEMPLATE/011-bug-results.yml @@ -100,8 +100,8 @@ body: label: Relevant log output description: > Please copy and paste any relevant log output, including the command that you entered and any generated text. - For very long logs (thousands of lines), preferably upload them as files instead. - On Linux you can redirect console output into a file by appending ` > llama.log 2>&1` to your command. 
+ For very long logs (thousands of lines), please upload them as files instead; the `--log-file` CLI argument can be used for this purpose. + On Linux you can alternatively redirect the console output of any command into a file by appending ` > llama.log 2>&1` to your command. value: |
Logs diff --git a/.github/ISSUE_TEMPLATE/019-bug-misc.yml b/.github/ISSUE_TEMPLATE/019-bug-misc.yml index 831c98eb6..041a7cdb2 100644 --- a/.github/ISSUE_TEMPLATE/019-bug-misc.yml +++ b/.github/ISSUE_TEMPLATE/019-bug-misc.yml @@ -88,8 +88,8 @@ body: description: > If applicable, please copy and paste any relevant log output, including any generated text. If you are encountering problems specifically with the `llama_params_fit` module, always upload `--verbose` logs as well. - For very long logs (thousands of lines), please upload them as files instead. - On Linux you can redirect console output into a file by appending ` > llama.log 2>&1` to your command. + For very long logs (thousands of lines), please upload them as files instead; the `--log-file` CLI argument can be used for this purpose. + On Linux you can alternatively redirect the console output of any command into a file by appending ` > llama.log 2>&1` to your command. value: |
Logs From 67ace021da905e27ecbdf1176b0eef578a5288c0 Mon Sep 17 00:00:00 2001 From: Aleksander Grygier Date: Tue, 19 May 2026 22:38:42 +0200 Subject: [PATCH 27/33] refactor: Chat Screen UI rendering (#23333) --- .../app/chat/ChatScreen/ChatScreen.svelte | 69 +++++++------------ .../ChatScreenActionScrollDown.svelte | 15 +++- .../chat/ChatScreen/ChatScreenGreeting.svelte | 25 +++++++ .../ChatScreenProcessingInfo.svelte | 16 ++++- .../ChatScreen/ChatScreenServerError.svelte | 34 +++++++++ tools/ui/src/lib/components/app/chat/index.ts | 7 ++ tools/ui/src/routes/+layout.svelte | 2 +- 7 files changed, 116 insertions(+), 52 deletions(-) create mode 100644 tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenGreeting.svelte create mode 100644 tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenServerError.svelte diff --git a/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte index bd93a569c..e733a64a9 100644 --- a/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte +++ b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreen.svelte @@ -1,8 +1,7 @@ + + diff --git a/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenProcessingInfo.svelte b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenProcessingInfo.svelte index b5979db13..f38f3519c 100644 --- a/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenProcessingInfo.svelte +++ b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenProcessingInfo.svelte @@ -6,6 +6,7 @@ import { activeMessages, activeConversation } from '$lib/stores/conversations.svelte'; import { config } from '$lib/stores/settings.svelte'; import { getProcessingInfoContext } from '$lib/contexts'; + import { page } from '$app/state'; const processingState = useProcessingState(); const processingInfoCtx = getProcessingInfoContext(); @@ -16,6 +17,14 @@ let isStreaming = $derived(isChatStreaming()); let processingDetails = 
$derived(processingState.getTechnicalDetails()); + let processingVisible = $derived(processingDetails.length > 0); + + let { onVisibilityChange }: { onVisibilityChange?: (visible: boolean) => void } = $props(); + + $effect(() => { + onVisibilityChange?.(processingVisible); + }); + $effect(() => { const conversation = activeConversation(); @@ -60,9 +69,12 @@
-
+
{#each processingDetails as detail (detail)} {detail} {/each} diff --git a/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenServerError.svelte b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenServerError.svelte new file mode 100644 index 000000000..2a998dbeb --- /dev/null +++ b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenServerError.svelte @@ -0,0 +1,34 @@ + + +{#if hasError} +
+ + + + + Server unavailable + + + + + {serverError()} + +
+{/if} diff --git a/tools/ui/src/lib/components/app/chat/index.ts b/tools/ui/src/lib/components/app/chat/index.ts index 9c7ce864e..be5535960 100644 --- a/tools/ui/src/lib/components/app/chat/index.ts +++ b/tools/ui/src/lib/components/app/chat/index.ts @@ -674,3 +674,10 @@ export { default as ChatScreenProcessingInfo } from './ChatScreen/ChatScreenProc * Takes the chat container element as a prop to manage scroll state internally. */ export { default as ChatScreenActionScrollDown } from './ChatScreen/ChatScreenActionScrollDown.svelte'; + +/** + * Server error alert displayed when the server is unreachable. + * Shows the error message with a retry button. + * Rendered inside ChatScreen when `serverError` store has a value. + */ +export { default as ChatScreenServerError } from './ChatScreen/ChatScreenServerError.svelte'; diff --git a/tools/ui/src/routes/+layout.svelte b/tools/ui/src/routes/+layout.svelte index 78227df3c..0610b07ae 100644 --- a/tools/ui/src/routes/+layout.svelte +++ b/tools/ui/src/routes/+layout.svelte @@ -240,7 +240,7 @@ /> -
+
From 17d22a35b268fe997f0f9551d6e39e576bada7fa Mon Sep 17 00:00:00 2001 From: Aparna M P Date: Wed, 20 May 2026 02:40:13 +0530 Subject: [PATCH 28/33] hexagon: add MROPE and IMROPE support in HTP rope op (#23317) --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 2 +- ggml/src/ggml-hexagon/htp/rope-ops.c | 123 ++++++++++++++++++++----- 2 files changed, 102 insertions(+), 23 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index ebeef3bdb..080fb7f47 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -2661,7 +2661,7 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess int mode = op_params[2]; - if ((mode & GGML_ROPE_TYPE_MROPE) || (mode & GGML_ROPE_TYPE_VISION)) { + if (mode == GGML_ROPE_TYPE_VISION) { return false; } if (mode & 1) { diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index 1d8b0796b..9901453e9 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -18,9 +18,11 @@ #include "htp-ops.h" #include "htp-ops.h" -// Redefined the types GGML_ROPE_TYPE_NORMAL & GGML_ROPE_TYPE_NEOX as we can't include ggml.h +// Redefined the rope type constants as we can't include ggml.h #define HTP_ROPE_TYPE_NORMAL 0 #define HTP_ROPE_TYPE_NEOX 2 +#define HTP_ROPE_TYPE_MROPE 8 +#define HTP_ROPE_TYPE_IMROPE 40 #define HTP_ROPE_SPAD_NROWS 16 #define HTP_ROPE_SPAD_BLOCK (HTP_ROPE_SPAD_NROWS/2) @@ -82,6 +84,29 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) { return (1 - MIN(1, MAX(0, y))); } +// Compute one (cos, sin) pair into cache[i0], cache[i0+1] applying YaRN scaling. 
+static inline void rope_yarn_one(float theta, float freq_scale, float * corr_dims, + uint32_t i0, float ext_factor, float mscale, + float * cache) { + float theta_extrap = theta; + + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float theta_final = theta_interp; + float mscale_final = mscale; + + if (ext_factor != 0.0f) { + float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; + theta_final = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + mscale_final *= 1.0f + 0.1f * logf(1.0f / freq_scale); + } + + cache[i0 + 0] = cosf(theta_final) * mscale_final; + cache[i0 + 1] = sinf(theta_final) * mscale_final; +} + static void rope_cache_init(const float theta_base, const float freq_scale, const float * freq_factors, @@ -96,29 +121,65 @@ static void rope_cache_init(const float theta_base, for (uint32_t i0 = 0; i0 < ne0; i0 += 2) { const float ff = freq_factors ? freq_factors[i0 / 2] : 1.0f; - - float theta_extrap = theta / ff; - - // Get n-d rotational scaling corrected for extrapolation - float theta_interp = freq_scale * theta_extrap; - float theta_final = theta_interp; - float mscale_final = mscale; - - if (ext_factor != 0.0f) { - float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; - theta_final = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; - - // Get n-d magnitude scaling corrected for interpolation - mscale_final *= 1.0f + 0.1f * logf(1.0f / freq_scale); - } - - cache[i0 + 0] = cosf(theta_final) * mscale_final; - cache[i0 + 1] = sinf(theta_final) * mscale_final; + rope_yarn_one(theta / ff, freq_scale, corr_dims, i0, ext_factor, mscale, cache); theta *= theta_scale; } } +// pos_t/h/w/e: the four position ids for this sequence step (t=time, h=height, w=width, e=extra). +// sections[4]: number of head dims assigned to each position component. 
+static void mrope_cache_init(const float pos_t, + const float pos_h, + const float pos_w, + const float pos_e, + const int32_t sections[4], + const bool is_imrope, + const float freq_scale, + const float * freq_factors, + float * corr_dims, + const uint32_t ne0, + const float ext_factor, + const float mscale, + float * cache, + const float theta_scale) { + const int sect_dims = sections[0] + sections[1] + sections[2] + sections[3]; + const int sec_w = sections[0] + sections[1]; + const int sec_e = sec_w + sections[2]; + + float theta_t = pos_t; + float theta_h = pos_h; + float theta_w = pos_w; + float theta_e = pos_e; + + for (uint32_t i0 = 0; i0 < ne0; i0 += 2) { + const float ff = freq_factors ? freq_factors[i0 / 2] : 1.0f; + const int sector = (i0 / 2) % sect_dims; + + float theta; + if (is_imrope) { + // Interleaved: sector mod 3 selects component + if (sector % 3 == 0 && sector < 3 * sections[0]) { theta = theta_t; } + else if (sector % 3 == 1 && sector < 3 * sections[1]) { theta = theta_h; } + else if (sector % 3 == 2 && sector < 3 * sections[2]) { theta = theta_w; } + else { theta = theta_e; } + } else { + // Contiguous sections + if (sector < sections[0]) { theta = theta_t; } + else if (sector < sec_w) { theta = theta_h; } + else if (sector < sec_e) { theta = theta_w; } + else { theta = theta_e; } + } + + rope_yarn_one(theta / ff, freq_scale, corr_dims, i0, ext_factor, mscale, cache); + + theta_t *= theta_scale; + theta_h *= theta_scale; + theta_w *= theta_scale; + theta_e *= theta_scale; + } +} + #define M_PI 3.1415926535897932384626433 static void rope_corr_dims(int n_dims, @@ -274,7 +335,8 @@ static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) { uint64_t tt = HAP_perf_get_qtimer_count(); const int32_t mode = rctx->mode; - const bool is_neox = mode & HTP_ROPE_TYPE_NEOX; + // MROPE and IMROPE use NEOX-style pairing for the rotation + const bool is_neox = (mode & HTP_ROPE_TYPE_NEOX) || (mode & HTP_ROPE_TYPE_MROPE); // VTCM setup 
uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread); @@ -326,8 +388,25 @@ static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) { if (i2 != prev_i2) { prev_i2 = i2; - const int32_t p = pos[i2]; - rope_cache_init(p, rctx->freq_scale, freq_factors, rctx->corr_dims, ne0, rctx->ext_factor, rctx->attn_factor, theta_cache, rctx->theta_scale); + const bool is_mrope = (rctx->mode & HTP_ROPE_TYPE_MROPE) != 0; + if (is_mrope) { + // src1 holds four position arrays stacked along ne0: + // pos[i2], pos[i2+ne2], pos[i2+ne2*2], pos[i2+ne2*3] + const bool is_imrope = (rctx->mode == HTP_ROPE_TYPE_IMROPE); + mrope_cache_init( + (float) pos[i2], + (float) pos[i2 + ne2], + (float) pos[i2 + ne2 * 2], + (float) pos[i2 + ne2 * 3], + rctx->sections, is_imrope, + rctx->freq_scale, freq_factors, rctx->corr_dims, + ne0, rctx->ext_factor, rctx->attn_factor, + theta_cache, rctx->theta_scale); + } else { + rope_cache_init(pos[i2], rctx->freq_scale, freq_factors, rctx->corr_dims, + ne0, rctx->ext_factor, rctx->attn_factor, + theta_cache, rctx->theta_scale); + } // FARF(HIGH, "rope-theta %u: ir %u i1 %u i2 %u i3 %u cache %p : usec %u", ith, ir, i1, i2, i3, theta_cache, // (unsigned) HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - rctx->t_start)); From b28a2f372a4a470a90ad10f93654e5dc33e78949 Mon Sep 17 00:00:00 2001 From: shaofeiqi Date: Tue, 19 May 2026 14:29:00 -0700 Subject: [PATCH 29/33] opencl: add MoE support for q4_k, q5_k, q6_k on Adreno (#23303) * opencl: add q4_k moe support * opencl: add q5_k moe support * opencl: add q6_k moe support * opencl: adjust format --------- Co-authored-by: Li He --- ggml/src/ggml-opencl/CMakeLists.txt | 6 + ggml/src/ggml-opencl/ggml-opencl.cpp | 946 +++++++++++++++++- ggml/src/ggml-opencl/kernels/cvt.cl | 385 +++++++ .../kernels/gemm_moe_q4_k_f32_ns.cl | 279 ++++++ .../kernels/gemm_moe_q5_k_f32_ns.cl | 284 ++++++ .../kernels/gemm_moe_q6_k_f32_ns.cl | 263 +++++ 
.../kernels/gemv_moe_q4_k_f32_ns.cl | 151 +++ .../kernels/gemv_moe_q5_k_f32_ns.cl | 156 +++ .../kernels/gemv_moe_q6_k_f32_ns.cl | 137 +++ 9 files changed, 2600 insertions(+), 7 deletions(-) create mode 100644 ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl create mode 100644 ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl create mode 100644 ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl create mode 100644 ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl create mode 100644 ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl create mode 100644 ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt index c6aba6087..f75d089b5 100644 --- a/ggml/src/ggml-opencl/CMakeLists.txt +++ b/ggml/src/ggml-opencl/CMakeLists.txt @@ -110,6 +110,12 @@ set(GGML_OPENCL_KERNELS gemv_moe_q5_0_f32_ns gemm_moe_q5_1_f32_ns gemv_moe_q5_1_f32_ns + gemm_moe_q4_k_f32_ns + gemv_moe_q4_k_f32_ns + gemm_moe_q5_k_f32_ns + gemv_moe_q5_k_f32_ns + gemm_moe_q6_k_f32_ns + gemv_moe_q6_k_f32_ns gemm_moe_mxfp4_f32 gemv_moe_mxfp4_f32 gemm_moe_mxfp4_f32_ns diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 0e511592d..a3af8c2da 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -558,6 +558,9 @@ struct ggml_backend_opencl_context { cl_kernel kernel_convert_block_q4_1_trans4_ns, kernel_restore_block_q4_1_trans4_ns; cl_kernel kernel_convert_block_q5_0_trans4_ns, kernel_restore_block_q5_0_trans4_ns; cl_kernel kernel_convert_block_q5_1_trans4_ns, kernel_restore_block_q5_1_trans4_ns; + cl_kernel kernel_convert_block_q4_k_trans4_ns, kernel_restore_block_q4_k_trans4_ns; + cl_kernel kernel_convert_block_q5_k_trans4_ns, kernel_restore_block_q5_k_trans4_ns; + cl_kernel kernel_convert_block_q6_k_trans4_ns, kernel_restore_block_q6_k_trans4_ns; cl_kernel kernel_convert_block_mxfp4, kernel_convert_block_mxfp4_trans, 
kernel_restore_block_mxfp4, kernel_restore_block_mxfp4_trans; cl_kernel kernel_convert_block_mxfp4_trans4_ns, kernel_restore_block_mxfp4_trans4_ns; cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0, kernel_restore_block_q8_0_trans; @@ -619,6 +622,9 @@ struct ggml_backend_opencl_context { cl_kernel kernel_gemv_moe_q4_1_f32_ns, kernel_gemm_moe_q4_1_f32_ns; cl_kernel kernel_gemv_moe_q5_0_f32_ns, kernel_gemm_moe_q5_0_f32_ns; cl_kernel kernel_gemv_moe_q5_1_f32_ns, kernel_gemm_moe_q5_1_f32_ns; + cl_kernel kernel_gemv_moe_q4_k_f32_ns, kernel_gemm_moe_q4_k_f32_ns; + cl_kernel kernel_gemv_moe_q5_k_f32_ns, kernel_gemm_moe_q5_k_f32_ns; + cl_kernel kernel_gemv_moe_q6_k_f32_ns, kernel_gemm_moe_q6_k_f32_ns; cl_kernel kernel_gemv_moe_mxfp4_f32, kernel_gemm_moe_mxfp4_f32; cl_kernel kernel_gemv_moe_mxfp4_f32_ns, kernel_gemm_moe_mxfp4_f32_ns; cl_kernel kernel_moe_reorder_b; @@ -981,6 +987,12 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve CL_CHECK((backend_ctx->kernel_restore_block_q5_0_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_0_trans4_ns", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_q5_1_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_1_trans4_ns", &err), err)); CL_CHECK((backend_ctx->kernel_restore_block_q5_1_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_1_trans4_ns", &err), err)); + CL_CHECK((backend_ctx->kernel_convert_block_q4_k_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_k_trans4_ns", &err), err)); + CL_CHECK((backend_ctx->kernel_restore_block_q4_k_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_k_trans4_ns", &err), err)); + CL_CHECK((backend_ctx->kernel_convert_block_q5_k_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_k_trans4_ns", &err), err)); + CL_CHECK((backend_ctx->kernel_restore_block_q5_k_trans4_ns = 
clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_k_trans4_ns", &err), err)); + CL_CHECK((backend_ctx->kernel_convert_block_q6_k_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q6_k_trans4_ns", &err), err)); + CL_CHECK((backend_ctx->kernel_restore_block_q6_k_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q6_k_trans4_ns", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_mxfp4_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4_trans", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_mxfp4_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4_trans4_ns", &err), err)); @@ -3071,6 +3083,108 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve GGML_LOG_CONT("."); } + // gemv_moe_q4_k_f32_ns + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "gemv_moe_q4_k_f32_ns.cl.h" + }; +#else + const std::string kernel_src = read_file("gemv_moe_q4_k_f32_ns.cl"); +#endif + cl_program prog = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts); + + CL_CHECK((backend_ctx->kernel_gemv_moe_q4_k_f32_ns = clCreateKernel(prog, "kernel_gemv_moe_q4_k_f32_ns", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + + // gemm_moe_q4_k_f32_ns + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "gemm_moe_q4_k_f32_ns.cl.h" + }; +#else + const std::string kernel_src = read_file("gemm_moe_q4_k_f32_ns.cl"); +#endif + cl_program prog = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts); + + CL_CHECK((backend_ctx->kernel_gemm_moe_q4_k_f32_ns = clCreateKernel(prog, "kernel_gemm_moe_q4_k_f32_ns", 
&err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + + // gemv_moe_q5_k_f32_ns + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "gemv_moe_q5_k_f32_ns.cl.h" + }; +#else + const std::string kernel_src = read_file("gemv_moe_q5_k_f32_ns.cl"); +#endif + cl_program prog = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts); + + CL_CHECK((backend_ctx->kernel_gemv_moe_q5_k_f32_ns = clCreateKernel(prog, "kernel_gemv_moe_q5_k_f32_ns", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + + // gemm_moe_q5_k_f32_ns + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "gemm_moe_q5_k_f32_ns.cl.h" + }; +#else + const std::string kernel_src = read_file("gemm_moe_q5_k_f32_ns.cl"); +#endif + cl_program prog = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts); + + CL_CHECK((backend_ctx->kernel_gemm_moe_q5_k_f32_ns = clCreateKernel(prog, "kernel_gemm_moe_q5_k_f32_ns", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + + // gemv_moe_q6_k_f32_ns + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "gemv_moe_q6_k_f32_ns.cl.h" + }; +#else + const std::string kernel_src = read_file("gemv_moe_q6_k_f32_ns.cl"); +#endif + cl_program prog = + build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts); + + CL_CHECK((backend_ctx->kernel_gemv_moe_q6_k_f32_ns = clCreateKernel(prog, "kernel_gemv_moe_q6_k_f32_ns", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + + // gemm_moe_q6_k_f32_ns + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "gemm_moe_q6_k_f32_ns.cl.h" + }; +#else + const std::string kernel_src = read_file("gemm_moe_q6_k_f32_ns.cl"); +#endif + cl_program prog = + 
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts); + + CL_CHECK((backend_ctx->kernel_gemm_moe_q6_k_f32_ns = clCreateKernel(prog, "kernel_gemm_moe_q6_k_f32_ns", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + // gemv_moe_mxfp4_f32_ns { #ifdef GGML_OPENCL_EMBED_KERNELS @@ -4148,6 +4262,8 @@ struct ggml_tensor_extra_cl_iq4_nl { struct ggml_tensor_extra_cl_q4_K { // Quantized values cl_mem q = nullptr; + // Quantized values in image1d_buffer_t. + cl_mem q_img = nullptr; // Scales for each super block. cl_mem s = nullptr; // Scales @@ -4176,12 +4292,18 @@ struct ggml_tensor_extra_cl_q4_K { CL_CHECK(clReleaseMemObject(dm)); dm = nullptr; } + if (q_img != nullptr) { + CL_CHECK(clReleaseMemObject(q_img)); + q_img = nullptr; + } } }; struct ggml_tensor_extra_cl_q5_K { // Lower 4 bits of quantized weights. cl_mem q = nullptr; + // Quantized values in image1d_buffer_t. + cl_mem q_img = nullptr; // Upper 1 bit of quantized weights. cl_mem qh = nullptr; // Scales for each block. @@ -4222,6 +4344,10 @@ struct ggml_tensor_extra_cl_q5_K { CL_CHECK(clReleaseMemObject(dm)); dm = nullptr; } + if (q_img != nullptr) { + CL_CHECK(clReleaseMemObject(q_img)); + q_img = nullptr; + } size_q = 0; size_qh = 0; @@ -4234,6 +4360,8 @@ struct ggml_tensor_extra_cl_q5_K { struct ggml_tensor_extra_cl_q6_K { // Lower 4 bits of quantized weights. cl_mem ql = nullptr; + // Lower 4 bits as image1d_buffer_t + cl_mem ql_img = nullptr; // Upper 2 bits of quantized weights. cl_mem qh = nullptr; // Scales for each block. 
@@ -4267,6 +4395,10 @@ struct ggml_tensor_extra_cl_q6_K { CL_CHECK(clReleaseMemObject(d)); d = nullptr; } + if (ql_img != nullptr) { + CL_CHECK(clReleaseMemObject(ql_img)); + ql_img = nullptr; + } size_ql = 0; size_qh = 0; @@ -4700,7 +4832,10 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te // the quantizations here currently do not - they are only supported by Adreno with certain shapes if (op->src[0]->type == GGML_TYPE_Q4_1 || op->src[0]->type == GGML_TYPE_Q5_0 || - op->src[0]->type == GGML_TYPE_Q5_1) { + op->src[0]->type == GGML_TYPE_Q5_1 || + op->src[0]->type == GGML_TYPE_Q4_K || + op->src[0]->type == GGML_TYPE_Q5_K || + op->src[0]->type == GGML_TYPE_Q6_K) { #ifdef GGML_OPENCL_USE_ADRENO_KERNELS if (op->src[1]->type == GGML_TYPE_F32) { return use_adreno_moe_kernels(backend_ctx, op->src[0]) @@ -6047,14 +6182,57 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); CL_CHECK(err); - #ifdef GGML_OPENCL_USE_ADRENO_KERNELS +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_moe_kernels(backend_ctx, tensor)) { + cl_kernel kernel = backend_ctx->kernel_convert_block_q4_k_trans4_ns; + + int ne00 = tensor->ne[0]; + int ne01 = tensor->ne[1]; + int ne02 = tensor->ne[2]; + + cl_uchar mask_0F = 0x0F; + cl_uchar mask_F0 = 0xF0; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->dm)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->s)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_uchar), &mask_0F)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_uchar), &mask_F0)); + + size_t global_work_size[] = {static_cast(((ne01 + 63) / 64) * 
64), static_cast(ne00 / 256), static_cast(ne02)}; + size_t local_work_size[] = {64, 1, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clReleaseMemObject(data_device)); + + cl_image_format img_format_q = {CL_R, CL_UNSIGNED_INT32}; + cl_image_desc img_desc_q = { + CL_MEM_OBJECT_IMAGE1D_BUFFER, + static_cast(ggml_nelements(tensor) / 8), + 0, 0, 0, 0, 0, 0, 0, + { extra->q } + }; + extra->q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_q, &img_desc_q, NULL, &err); + CL_CHECK(err); + tensor->extra = extra; + + return; + } +#endif // GGML_OPENCL_USE_ADRENO_KERNELS + +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS cl_kernel kernel = backend_ctx->kernel_convert_block_q4_K; if (use_adreno_kernels(backend_ctx, tensor)) { kernel = backend_ctx->kernel_convert_block_q4_K_noshuffle; } - #else +#else cl_kernel kernel = backend_ctx->kernel_convert_block_q4_K; - #endif +#endif // GGML_OPENCL_USE_ADRENO_KERNELS cl_uchar mask_0F = 0x0F; cl_uchar mask_F0 = 0xF0; @@ -6157,14 +6335,58 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, CL_CHECK((extra->qh = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); CL_CHECK(err); - #ifdef GGML_OPENCL_USE_ADRENO_KERNELS +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_moe_kernels(backend_ctx, tensor)) { + cl_kernel kernel = backend_ctx->kernel_convert_block_q5_k_trans4_ns; + + int ne00 = tensor->ne[0]; + int ne01 = tensor->ne[1]; + int ne02 = tensor->ne[2]; + + cl_uchar mask_0F = 0x0F; + cl_uchar mask_F0 = 0xF0; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 4, 
sizeof(cl_mem), &extra->dm)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extra->s)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_uchar), &mask_0F)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_uchar), &mask_F0)); + + size_t global_work_size[] = {static_cast(((ne01 + 63) / 64) * 64), static_cast(ne00 / 256), static_cast(ne02)}; + size_t local_work_size[] = {64, 1, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clReleaseMemObject(data_device)); + + cl_image_format img_format_q = {CL_R, CL_UNSIGNED_INT32}; + cl_image_desc img_desc_q = { + CL_MEM_OBJECT_IMAGE1D_BUFFER, + static_cast(ggml_nelements(tensor) / 8), + 0, 0, 0, 0, 0, 0, 0, + { extra->q } + }; + extra->q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_q, &img_desc_q, NULL, &err); + CL_CHECK(err); + tensor->extra = extra; + + return; + } +#endif // GGML_OPENCL_USE_ADRENO_KERNELS + +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS cl_kernel kernel = backend_ctx->kernel_convert_block_q5_K; if (use_adreno_kernels(backend_ctx, tensor)) { kernel = backend_ctx->kernel_convert_block_q5_K_noshuffle; } - #else +#else cl_kernel kernel = backend_ctx->kernel_convert_block_q5_K; - #endif +#endif cl_uchar mask_0F = 0x0F; cl_uchar mask_F0 = 0xF0; @@ -6232,6 +6454,79 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, cl_buffer_region region; + cl_uchar mask_0F = 0x0F; + cl_uchar mask_F0 = 0xF0; + +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + // Adreno MoE Q6_K kernel needs special transposed layout + if (use_adreno_moe_kernels(backend_ctx, tensor)) { + size_t moe_size_ql = (size_t)(ggml_nelements(tensor) / 8) * sizeof(uint32_t); // 4 bits per element + size_t moe_size_qh = (size_t)(ggml_nelements(tensor) / 16) * sizeof(uint32_t); // 2 
bits per element + size_t moe_size_s = size_s; + size_t moe_size_d = size_d; + + // Subbuffer for ql + region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment); + region.size = moe_size_ql; + CL_CHECK((extra->ql = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); + auto previous_origin = region.origin; + + // Subbuffer for qh + region.origin = align_to(previous_origin + moe_size_ql, backend_ctx->alignment); + region.size = moe_size_qh; + CL_CHECK((extra->qh = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); + previous_origin = region.origin; + + // Subbuffer for scales + region.origin = align_to(previous_origin + moe_size_qh, backend_ctx->alignment); + region.size = moe_size_s; + CL_CHECK((extra->s = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); + previous_origin = region.origin; + + // Subbuffer for d + region.origin = align_to(previous_origin + moe_size_s, backend_ctx->alignment); + region.size = moe_size_d; + CL_CHECK((extra->d = clCreateSubBuffer(extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); + + cl_kernel kernel = backend_ctx->kernel_convert_block_q6_k_trans4_ns; + + int ne00 = tensor->ne[0]; + int ne01 = tensor->ne[1]; + int ne02 = tensor->ne[2]; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->ql)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->s)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_uchar), &mask_0F)); + 
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_uchar), &mask_F0)); + + size_t global_work_size[] = {static_cast(((ne01 + 63) / 64) * 64), static_cast(ne00 / 256), static_cast(ne02)}; + size_t local_work_size[] = {64, 1, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clReleaseMemObject(data_device)); + + // Create image for ql + cl_image_format img_format_ql = {CL_R, CL_UNSIGNED_INT32}; + cl_image_desc img_desc_ql = { + CL_MEM_OBJECT_IMAGE1D_BUFFER, + static_cast(ggml_nelements(tensor) / 8), + 0, 0, 0, 0, 0, 0, 0, + { extra->ql } + }; + extra->ql_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_ql, &img_desc_ql, NULL, &err); + tensor->extra = extra; + + return; + } +#endif // GGML_OPENCL_USE_ADRENO_KERNELS + // Subbuffer for ql region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment); region.size = size_ql; @@ -6825,6 +7120,40 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, cl_uchar mask_F0 = 0xF0; #ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_moe_kernels(backend_ctx, tensor)) { + cl_int err; + cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, + ggml_nbytes(tensor), NULL, &err); + CL_CHECK(err); + + cl_kernel kernel = backend_ctx->kernel_restore_block_q4_k_trans4_ns; + + int ne00 = tensor->ne[0]; + int ne01 = tensor->ne[1]; + int ne02 = tensor->ne[2]; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->dm)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->s)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &ne01)); + 
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_uchar), &mask_0F)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_uchar), &mask_F0)); + + size_t global_work_size[] = {static_cast(((ne01 + 63) / 64) * 64), static_cast(ne00 / 256), static_cast(ne02)}; + size_t local_work_size[] = {64, 1, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, + global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clEnqueueReadBuffer( + queue, data_device, CL_TRUE, offset, + size, data, 0, NULL, NULL)); + CL_CHECK(clReleaseMemObject(data_device)); + return; + } if (use_adreno_kernels(backend_ctx, tensor)) { int M = tensor->ne[1]; int K = tensor->ne[0]; @@ -6901,6 +7230,40 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, cl_uchar mask_F0 = 0xF0; #ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_moe_kernels(backend_ctx, tensor)) { + cl_int err; + cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, + ggml_nbytes(tensor), NULL, &err); + CL_CHECK(err); + cl_kernel kernel = backend_ctx->kernel_restore_block_q5_k_trans4_ns; + + int ne00 = tensor->ne[0]; + int ne01 = tensor->ne[1]; + int ne02 = tensor->ne[2]; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qh)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->dm)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->s)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_uchar), &mask_0F)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_uchar), &mask_F0)); + + size_t global_work_size[] = {static_cast(((ne01 + 63) / 64) * 64), static_cast(ne00 / 256), static_cast(ne02)}; + 
size_t local_work_size[] = {64, 1, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, + global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clEnqueueReadBuffer( + queue, data_device, CL_TRUE, offset, + size, data, 0, NULL, NULL)); + CL_CHECK(clReleaseMemObject(data_device)); + return; + } if (use_adreno_kernels(backend_ctx, tensor)) { int M = tensor->ne[1]; int K = tensor->ne[0]; @@ -6974,7 +7337,44 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, if (tensor->type == GGML_TYPE_Q6_K) { ggml_tensor_extra_cl_q6_K * extra = (ggml_tensor_extra_cl_q6_K *)tensor->extra; + cl_uchar mask_0F = 0x0F; + cl_uchar mask_F0 = 0xF0; + #ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_moe_kernels(backend_ctx, tensor)) { + cl_int err; + cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, + ggml_nbytes(tensor), NULL, &err); + CL_CHECK(err); + + cl_kernel kernel = backend_ctx->kernel_restore_block_q6_k_trans4_ns; + + int ne00 = tensor->ne[0]; + int ne01 = tensor->ne[1]; + int ne02 = tensor->ne[2]; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->ql)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qh)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->s)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_uchar), &mask_0F)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_uchar), &mask_F0)); + + size_t global_work_size[] = {static_cast(((ne01 + 63) / 64) * 64), static_cast(ne00 / 256), static_cast(ne02)}; + size_t local_work_size[] = {64, 1, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, + global_work_size, local_work_size, 0, NULL, 
&evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clEnqueueReadBuffer( + queue, data_device, CL_TRUE, offset, + size, data, 0, NULL, NULL)); + CL_CHECK(clReleaseMemObject(data_device)); + return; + } if (use_adreno_kernels(backend_ctx, tensor)) { static ggml_cl_buffer buf_trans_ql; static ggml_cl_buffer buf_trans_qh; @@ -13733,6 +14133,9 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor_extra_cl_q4_1 * extra0_q4_1 = (ggml_tensor_extra_cl_q4_1 *)src0->extra; ggml_tensor_extra_cl_q5_0 * extra0_q5_0 = (ggml_tensor_extra_cl_q5_0 *)src0->extra; ggml_tensor_extra_cl_q5_1 * extra0_q5_1 = (ggml_tensor_extra_cl_q5_1 *)src0->extra; + ggml_tensor_extra_cl_q4_K * extra0_q4_K = (ggml_tensor_extra_cl_q4_K *)src0->extra; + ggml_tensor_extra_cl_q5_K * extra0_q5_K = (ggml_tensor_extra_cl_q5_K *)src0->extra; + ggml_tensor_extra_cl_q6_K * extra0_q6_K = (ggml_tensor_extra_cl_q6_K *)src0->extra; ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra; ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra; #endif @@ -13741,6 +14144,9 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, (void)extra0_q4_1; (void)extra0_q5_0; (void)extra0_q5_1; + (void)extra0_q4_K; + (void)extra0_q5_K; + (void)extra0_q6_K; const int ne00 = src0->ne[0]; const int ne01 = src0->ne[1]; @@ -14612,6 +15018,532 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, #endif // GGML_OPENCL_SOA_Q break; } + case GGML_TYPE_Q4_K: { +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_moe_kernels(backend_ctx, src0)) { + cl_int status; + + size_t local_size[3] = {64, 2, 1}; + size_t global_size[3] = {64, 2, 1}; + + if (ne12 == 1) { // for gemv + kernel = backend_ctx->kernel_gemv_moe_q4_k_f32_ns; + + cl_mem src1_sub_buffer, buf_src1_image, buf_src2; + + // create a sub_buffer for src2 + cl_buffer_region region; + region.origin = offset2; + 
region.size = ne20 * ne21 * sizeof(int); + buf_src2 = clCreateSubBuffer(extra2->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // set thread grid + global_size[0] = static_cast(ne01); + global_size[1] = 4; + global_size[2] = static_cast(ne20); + local_size[1] = 4; + + // create a sub_buffer for src1 + region.origin = offset1; + region.size = ne10 * ne11 * ne12 * sizeof(float); + src1_sub_buffer = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // create image for src1 + cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT}; + cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne10 * ne11 * ne12 / 4), 0,0,0,0,0,0,0, {src1_sub_buffer}}; + buf_src1_image = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status); + CL_CHECK(status); + + // Set kernel args + int arg_idx = 0; + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_K->q)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_K->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_K->dm)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_K->s)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src1_image)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne11)); + + // launch kernel + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst); + + // deallocate sub buffers and images + 
CL_CHECK(clReleaseMemObject(src1_sub_buffer)); + CL_CHECK(clReleaseMemObject(buf_src1_image)); + CL_CHECK(clReleaseMemObject(buf_src2)); + + } else { // for gemm + kernel = backend_ctx->kernel_gemm_moe_q4_k_f32_ns; + + // Reorder router if called from test-backend-ops or when new router is generated. + // Otherwise reuse the reordered result from previous mul_mat_id call. + if ((strstr(src0->name, "as") != NULL) || backend_ctx->toggle_reorder) { + moe_router_reoerder(backend, src2, ne20); + backend_ctx->toggle_reorder = false; + } + + cl_mem sub_buf_src1_pre, buf_src1_reordered, image_src1_reordered, sub_buf_dst, buf_dst_image; + cl_mem buf_src2, buf_src2_emap; + + cl_buffer_region region; + region.origin = 0; + region.size = sizeof(int) * max_post_router_tile * n_tile_size; + buf_src2 = clCreateSubBuffer(backend_ctx->prealloc_post_router.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + region.origin = 0; + region.size = sizeof(short) * max_post_router_tile; + buf_src2_emap = clCreateSubBuffer(backend_ctx->prealloc_emap.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // Reorder activations + region.origin = offset1; + region.size = ne10 * ne11 * ne12 * sizeof(float); + sub_buf_src1_pre = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // Create image for reordered src1 + region.origin = 0; + region.size = ne00 * max_post_router_tile * n_tile_size * sizeof(float); + backend_ctx->prealloc_act_trans.allocate(backend_ctx->context, region.size); + buf_src1_reordered = clCreateSubBuffer( + backend_ctx->prealloc_act_trans.buffer, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + ®ion, + &status); + CL_CHECK(status); + cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT}; + cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne00 * max_post_router_tile * n_tile_size / 4), 0,0,0,0,0,0,0, {buf_src1_reordered}}; + 
image_src1_reordered = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status); + CL_CHECK(status); + + unsigned short map_ratio = ne20 / ne11; + GGML_ASSERT(((map_ratio == 1) || (map_ratio == ne20)) && "Map ratio not supported\n"); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 0, sizeof(cl_mem), &sub_buf_src1_pre)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 1, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 2, sizeof(cl_mem), &buf_src1_reordered)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 3, sizeof(cl_mem), &(backend_ctx->prealloc_total_tiles.buffer))); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 4, sizeof(unsigned int), &ne00)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 5, sizeof(unsigned short), &map_ratio)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 6, sizeof(unsigned int), &n_tile_size)); + + size_t reorder_b_local_size[3] = {256, 1, 1}; + size_t reorder_b_global_size[3] = {static_cast(((ne00 / 4) + 255) / 256 * 256), static_cast(max_post_router_tile * n_tile_size), 1}; + + // Dispatch reorder kernel + backend_ctx->enqueue_ndrange_kernel(backend_ctx->kernel_moe_reorder_b, 3, reorder_b_global_size, reorder_b_local_size, dst); + + // MoE kernel prepare + region.origin = offsetd; + region.size = ne0 * ne1 * ne2 * sizeof(float); + sub_buf_dst = clCreateSubBuffer( + extrad->data_device, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + ®ion, + &status); + CL_CHECK(status); + // Create image for dst + cl_image_format image_format_buf_dst = {CL_R, CL_FLOAT}; + cl_image_desc image_desc_buf_dst = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne0 * ne1 * ne2), 0,0,0,0,0,0,0, {sub_buf_dst}}; + buf_dst_image = clCreateImage(backend_ctx->context, CL_MEM_WRITE_ONLY, &image_format_buf_dst, &image_desc_buf_dst, NULL, &status); + CL_CHECK(status); + + // Set kernel args + int arg_idx 
= 0; + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_K->q_img)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_K->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_K->dm)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q4_K->s)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &image_src1_reordered)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2_emap)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_dst_image)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &(backend_ctx->prealloc_total_tiles.buffer))); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01)); + + // set thread grid + global_size[1] = static_cast((ne01 + 63) / 64); + global_size[2] = static_cast(max_post_router_tile); + local_size[1] = 1; + local_size[2] = 1; + + // Dispatch kernel + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst); + + clReleaseMemObject(sub_buf_src1_pre); + clReleaseMemObject(buf_src1_reordered); + clReleaseMemObject(image_src1_reordered); + clReleaseMemObject(buf_src2); + clReleaseMemObject(buf_src2_emap); + clReleaseMemObject(sub_buf_dst); + clReleaseMemObject(buf_dst_image); + } + return; + } +#endif //GGML_OPENCL_USE_ADRENO_KERNELS + } + case GGML_TYPE_Q5_K: { +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_moe_kernels(backend_ctx, src0)) { + cl_int status; + + size_t local_size[3] = {64, 2, 1}; + size_t global_size[3] = {64, 2, 1}; + + if (ne12 == 1) { // for gemv + kernel = backend_ctx->kernel_gemv_moe_q5_k_f32_ns; + + cl_mem src1_sub_buffer, buf_src1_image, buf_src2; + + // create a sub_buffer for src2 + cl_buffer_region region; + region.origin = offset2; + region.size = ne20 * ne21 * sizeof(int); + buf_src2 = 
clCreateSubBuffer(extra2->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // set thread grid + global_size[0] = static_cast(ne01); + global_size[1] = 4; + global_size[2] = static_cast(ne20); + local_size[1] = 4; + + // create a sub_buffer for src1 + region.origin = offset1; + region.size = ne10 * ne11 * ne12 * sizeof(float); + src1_sub_buffer = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // create image for src1 + cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT}; + cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne10 * ne11 * ne12 / 4), 0,0,0,0,0,0,0, {src1_sub_buffer}}; + buf_src1_image = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status); + CL_CHECK(status); + + // Set kernel args + int arg_idx = 0; + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_K->q)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_K->qh)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_K->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_K->dm)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_K->s)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src1_image)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne11)); + + // launch kernel + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst); + + // deallocate sub buffers and images + 
CL_CHECK(clReleaseMemObject(src1_sub_buffer)); + CL_CHECK(clReleaseMemObject(buf_src1_image)); + CL_CHECK(clReleaseMemObject(buf_src2)); + + } else { // for gemm + kernel = backend_ctx->kernel_gemm_moe_q5_k_f32_ns; + + // Reorder router if called from test-backend-ops or when new router is generated. + // Otherwise reuse the reordered result from previous mul_mat_id call. + if ((strstr(src0->name, "as") != NULL) || backend_ctx->toggle_reorder) { + moe_router_reoerder(backend, src2, ne20); + backend_ctx->toggle_reorder = false; + } + + cl_mem sub_buf_src1_pre, buf_src1_reordered, image_src1_reordered, sub_buf_dst, buf_dst_image; + cl_mem buf_src2, buf_src2_emap; + + cl_buffer_region region; + region.origin = 0; + region.size = sizeof(int) * max_post_router_tile * n_tile_size; + buf_src2 = clCreateSubBuffer(backend_ctx->prealloc_post_router.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + region.origin = 0; + region.size = sizeof(short) * max_post_router_tile; + buf_src2_emap = clCreateSubBuffer(backend_ctx->prealloc_emap.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // Reorder activations + // create a sub_buffer for src1 + region.origin = offset1; + region.size = ne10 * ne11 * ne12 * sizeof(float); + sub_buf_src1_pre = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // Create image for reordered src1 + // Use pre-allocated placeholder + region.origin = 0; + region.size = ne00 * max_post_router_tile * n_tile_size * sizeof(float); + backend_ctx->prealloc_act_trans.allocate(backend_ctx->context, region.size); + buf_src1_reordered = clCreateSubBuffer( + backend_ctx->prealloc_act_trans.buffer, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + ®ion, + &status); + CL_CHECK(status); + cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT}; + cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne00 * max_post_router_tile * 
n_tile_size / 4), 0,0,0,0,0,0,0, {buf_src1_reordered}}; + image_src1_reordered = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status); + CL_CHECK(status); + + unsigned short map_ratio = ne20 / ne11; + GGML_ASSERT(((map_ratio == 1) || (map_ratio == ne20)) && "Map ratio not supported\n"); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 0, sizeof(cl_mem), &sub_buf_src1_pre)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 1, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 2, sizeof(cl_mem), &buf_src1_reordered)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 3, sizeof(cl_mem), &(backend_ctx->prealloc_total_tiles.buffer))); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 4, sizeof(unsigned int), &ne00)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 5, sizeof(unsigned short), &map_ratio)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 6, sizeof(unsigned int), &n_tile_size)); + + size_t reorder_b_local_size[3] = {256, 1, 1}; + size_t reorder_b_global_size[3] = {static_cast(((ne00 / 4) + 255) / 256 * 256), static_cast(max_post_router_tile * n_tile_size), 1}; + + // Dispatch reorder kernel + backend_ctx->enqueue_ndrange_kernel(backend_ctx->kernel_moe_reorder_b, 3, reorder_b_global_size, reorder_b_local_size, dst); + + // MoE kernel prepare + // Create sub buffer for dst + region.origin = offsetd; + region.size = ne0 * ne1 * ne2 * sizeof(float); + sub_buf_dst = clCreateSubBuffer( + extrad->data_device, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + ®ion, + &status); + CL_CHECK(status); + // Create image for dst + cl_image_format image_format_buf_dst = {CL_R, CL_FLOAT}; + cl_image_desc image_desc_buf_dst = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne0 * ne1 * ne2), 0,0,0,0,0,0,0, {sub_buf_dst}}; + buf_dst_image = clCreateImage(backend_ctx->context, CL_MEM_WRITE_ONLY, &image_format_buf_dst, 
&image_desc_buf_dst, NULL, &status); + CL_CHECK(status); + + // Set kernel args + int arg_idx = 0; + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_K->q_img)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_K->qh)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_K->s)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_K->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q5_K->dm)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &image_src1_reordered)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2_emap)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_dst_image)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &(backend_ctx->prealloc_total_tiles.buffer))); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01)); + + // set thread grid + global_size[1] = static_cast((ne01 + 63) / 64); + global_size[2] = static_cast(max_post_router_tile); + local_size[1] = 1; + local_size[2] = 1; + + // Dispatch kernel + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst); + + clReleaseMemObject(sub_buf_src1_pre); + clReleaseMemObject(buf_src1_reordered); + clReleaseMemObject(image_src1_reordered); + clReleaseMemObject(buf_src2); + clReleaseMemObject(buf_src2_emap); + clReleaseMemObject(sub_buf_dst); + clReleaseMemObject(buf_dst_image); + } + return; + } +#endif //GGML_OPENCL_USE_ADRENO_KERNELS + } + case GGML_TYPE_Q6_K: { +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_moe_kernels(backend_ctx, src0)) { + cl_int status; + + size_t local_size[3] = {64, 2, 1}; + size_t global_size[3] = {64, 2, 1}; + + if (ne12 == 1) { // for gemv + kernel = backend_ctx->kernel_gemv_moe_q6_k_f32_ns; + + cl_mem src1_sub_buffer, 
buf_src1_image, buf_src2; + + // create a sub_buffer for src2 + cl_buffer_region region; + region.origin = offset2; + region.size = ne20 * ne21 * sizeof(int); + buf_src2 = clCreateSubBuffer(extra2->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // set thread grid + global_size[0] = static_cast(ne01); + global_size[1] = 4; + global_size[2] = static_cast(ne20); + local_size[1] = 4; + + // create a sub_buffer for src1 + region.origin = offset1; + region.size = ne10 * ne11 * ne12 * sizeof(float); + src1_sub_buffer = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // create image for src1 + cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT}; + cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne10 * ne11 * ne12 / 4), 0,0,0,0,0,0,0, {src1_sub_buffer}}; + buf_src1_image = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status); + CL_CHECK(status); + + // Set kernel args + int arg_idx = 0; + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q6_K->ql)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q6_K->qh)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q6_K->s)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q6_K->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src1_image)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne11)); + + // launch kernel + backend_ctx->enqueue_ndrange_kernel(kernel, 
3, global_size, local_size, dst); + + // deallocate sub buffers and images + CL_CHECK(clReleaseMemObject(src1_sub_buffer)); + CL_CHECK(clReleaseMemObject(buf_src1_image)); + CL_CHECK(clReleaseMemObject(buf_src2)); + + } else { // for gemm + kernel = backend_ctx->kernel_gemm_moe_q6_k_f32_ns; + + // Reorder router if called from test-backend-ops or when new router is generated. + // Otherwise reuse the reordered result from previous mul_mat_id call. + if ((strstr(src0->name, "as") != NULL) || backend_ctx->toggle_reorder) { + moe_router_reoerder(backend, src2, ne20); + backend_ctx->toggle_reorder = false; + } + + cl_mem sub_buf_src1_pre, buf_src1_reordered, image_src1_reordered, sub_buf_dst, buf_dst_image; + cl_mem buf_src2, buf_src2_emap; + + cl_buffer_region region; + region.origin = 0; + region.size = sizeof(int) * max_post_router_tile * n_tile_size; + buf_src2 = clCreateSubBuffer(backend_ctx->prealloc_post_router.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + region.origin = 0; + region.size = sizeof(short) * max_post_router_tile; + buf_src2_emap = clCreateSubBuffer(backend_ctx->prealloc_emap.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // Reorder activations + // create a sub_buffer for src1 + region.origin = offset1; + region.size = ne10 * ne11 * ne12 * sizeof(float); + sub_buf_src1_pre = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); + CL_CHECK(status); + + // Create image for reordered src1 + region.origin = 0; + region.size = ne00 * max_post_router_tile * n_tile_size * sizeof(float); + backend_ctx->prealloc_act_trans.allocate(backend_ctx->context, region.size); + buf_src1_reordered = clCreateSubBuffer( + backend_ctx->prealloc_act_trans.buffer, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + ®ion, + &status); + CL_CHECK(status); + cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT}; + cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, 
static_cast(ne00 * max_post_router_tile * n_tile_size / 4), 0,0,0,0,0,0,0, {buf_src1_reordered}}; + image_src1_reordered = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status); + CL_CHECK(status); + + unsigned short map_ratio = ne20 / ne11; + GGML_ASSERT(((map_ratio == 1) || (map_ratio == ne20)) && "Map ratio not supported\n"); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 0, sizeof(cl_mem), &sub_buf_src1_pre)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 1, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 2, sizeof(cl_mem), &buf_src1_reordered)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 3, sizeof(cl_mem), &(backend_ctx->prealloc_total_tiles.buffer))); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 4, sizeof(unsigned int), &ne00)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 5, sizeof(unsigned short), &map_ratio)); + CL_CHECK(clSetKernelArg(backend_ctx->kernel_moe_reorder_b, 6, sizeof(unsigned int), &n_tile_size)); + + size_t reorder_b_local_size[3] = {256, 1, 1}; + size_t reorder_b_global_size[3] = {static_cast(((ne00 / 4) + 255) / 256 * 256), static_cast(max_post_router_tile * n_tile_size), 1}; + + // Dispatch reorder kernel + backend_ctx->enqueue_ndrange_kernel(backend_ctx->kernel_moe_reorder_b, 3, reorder_b_global_size, reorder_b_local_size, dst); + + // MoE kernel prepare + // Create sub buffer for dst + region.origin = offsetd; + region.size = ne0 * ne1 * ne2 * sizeof(float); + sub_buf_dst = clCreateSubBuffer( + extrad->data_device, + 0, + CL_BUFFER_CREATE_TYPE_REGION, + ®ion, + &status); + CL_CHECK(status); + // Create image for dst + cl_image_format image_format_buf_dst = {CL_R, CL_FLOAT}; + cl_image_desc image_desc_buf_dst = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast(ne0 * ne1 * ne2), 0,0,0,0,0,0,0, {sub_buf_dst}}; + buf_dst_image = clCreateImage(backend_ctx->context, 
CL_MEM_WRITE_ONLY, &image_format_buf_dst, &image_desc_buf_dst, NULL, &status); + CL_CHECK(status); + + // Set kernel args + int arg_idx = 0; + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q6_K->ql_img)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q6_K->qh)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q6_K->s)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_q6_K->d)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &image_src1_reordered)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2_emap)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_dst_image)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &(backend_ctx->prealloc_total_tiles.buffer))); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01)); + + // set thread grid + global_size[1] = static_cast((ne01 + 63) / 64); + global_size[2] = static_cast(max_post_router_tile); + local_size[1] = 1; + local_size[2] = 1; + + // Dispatch kernel + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst); + + clReleaseMemObject(sub_buf_src1_pre); + clReleaseMemObject(buf_src1_reordered); + clReleaseMemObject(image_src1_reordered); + clReleaseMemObject(buf_src2); + clReleaseMemObject(buf_src2_emap); + clReleaseMemObject(sub_buf_dst); + clReleaseMemObject(buf_dst_image); + } + return; + } +#endif //GGML_OPENCL_USE_ADRENO_KERNELS + } case GGML_TYPE_MXFP4: { #ifdef GGML_OPENCL_USE_ADRENO_KERNELS if (use_adreno_moe_kernels(backend_ctx, src0)) { diff --git a/ggml/src/ggml-opencl/kernels/cvt.cl b/ggml/src/ggml-opencl/kernels/cvt.cl index 8f06d5705..312366984 100644 --- a/ggml/src/ggml-opencl/kernels/cvt.cl +++ b/ggml/src/ggml-opencl/kernels/cvt.cl @@ -664,6 +664,391 @@ kernel void 
kernel_restore_block_q5_1_trans4_ns( ((__global ushort8 *)(&(b->qs[0])))[0] = pre_block; } +kernel void kernel_convert_block_q4_k_trans4_ns( + __global struct block_q4_K * src0, + __global uint * dst_q, + __global half * dst_d, + __global half * dst_dm, + __global uchar * dst_s, + uint ne00, + uint ne01, + uchar mask_0F, + uchar mask_F0 +) { + uint i00 = get_global_id(1); + uint i01 = get_global_id(0); + uint i02 = get_global_id(2); + + uint ne00_blk = ne00 / QK_K; + uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; + uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; + + __global struct block_q4_K * b = src0 + src_blk_offset; + + dst_d [dst_blk_offset] = b->d; + dst_dm[dst_blk_offset] = b->dm; + + uint4 qv[8]; + uchar * qv_bytes = (uchar *)qv; + for (int i = 0; i < QK_K / 64; ++i) { + for (int j = 0; j < 16; ++j) { + uchar x0 = b->q[i*32 + 2*j]; + uchar x1 = b->q[i*32 + 2*j + 1]; + + qv_bytes[i*32 + j ] = convert_uchar(x0 & mask_0F) | convert_uchar((x1 & mask_0F) << 4); + qv_bytes[i*32 + j + 16] = convert_uchar((x0 & mask_F0) >> 4) | convert_uchar(x1 & mask_F0); + } + } + + uint base = i02 * ne00_blk * ne01 * 32 + i00 * ne01 * 32 + i01; + #pragma unroll + for (int p = 0; p < 8; ++p) { + uint4 v = qv[p]; + dst_q[base + (p * 4 + 0) * ne01] = v.x; + dst_q[base + (p * 4 + 1) * ne01] = v.y; + dst_q[base + (p * 4 + 2) * ne01] = v.z; + dst_q[base + (p * 4 + 3) * ne01] = v.w; + } + + __global uchar * s_dst = dst_s + (i02 * ne01 + i01) * ne00_blk * K_SCALE_SIZE + i00 * K_SCALE_SIZE; + #pragma unroll + for (int i = 0; i < K_SCALE_SIZE; ++i) { + s_dst[i] = b->s[i]; + } +} + +kernel void kernel_restore_block_q4_k_trans4_ns( + __global uint * src_q, + __global half * src_d, + __global half * src_dm, + __global uchar * src_s, + __global struct block_q4_K * dst0, + uint ne00, + uint ne01, + uchar mask_0F, + uchar mask_F0 +) { + uint i00 = get_global_id(1); // block index along K + uint i01 = get_global_id(0); // row index + uint i02 = 
get_global_id(2); // batch index + + uint ne00_blk = ne00 / QK_K; + + uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; + uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; + + __global struct block_q4_K * b = dst0 + dst_blk_offset; + + b->d = src_d[src_blk_offset]; + b->dm = src_dm[src_blk_offset]; + + __global uchar * s_src = src_s + (i02 * ne01 + i01) * ne00_blk * K_SCALE_SIZE + i00 * K_SCALE_SIZE; + for (int i = 0; i < K_SCALE_SIZE; ++i) { + b->s[i] = s_src[i]; + } + + uint base = i02 * ne00_blk * ne01 * 32 + i00 * ne01 * 32 + i01; + + uint4 qv[8]; + for (int p = 0; p < 8; ++p) { + qv[p].x = src_q[base + (p * 4 + 0) * ne01]; + qv[p].y = src_q[base + (p * 4 + 1) * ne01]; + qv[p].z = src_q[base + (p * 4 + 2) * ne01]; + qv[p].w = src_q[base + (p * 4 + 3) * ne01]; + } + + uchar * qv_bytes = (uchar *)qv; + for (int i = 0; i < QK_K / 64; ++i) { + for (int j = 0; j < 16; ++j) { + uchar lo = qv_bytes[i*32 + j]; + uchar hi = qv_bytes[i*32 + j + 16]; + b->q[i*32 + 2*j] = convert_uchar((lo & mask_0F) | ((hi & mask_0F) << 4)); + b->q[i*32 + 2*j + 1] = convert_uchar(((lo & mask_F0) >> 4) | (hi & mask_F0)); + } + } +} + +kernel void kernel_convert_block_q5_k_trans4_ns( + __global struct block_q5_K * src0, + __global uint * dst_qs, + __global uint * dst_qh, + __global half * dst_d, + __global half * dst_dm, + __global uchar * dst_s, + uint ne00, + uint ne01, + uchar mask_0F, + uchar mask_F0 +) { + uint i00 = get_global_id(1); + uint i01 = get_global_id(0); + uint i02 = get_global_id(2); + + uint ne00_blk = ne00 / QK_K; + uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; + uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; + + __global struct block_q5_K * b = src0 + src_blk_offset; + + dst_d [dst_blk_offset] = b->d; + dst_dm[dst_blk_offset] = b->dm; + + for (int k = 0; k < 8; k++) { + uchar b0 = 0, b1 = 0, b2 = 0, b3 = 0; + for (int bit = 0; bit < 8; bit++) { + b0 |= (uchar)(((b->qh[bit] >> k) & 1) << bit); + b1 |= 
(uchar)(((b->qh[8 + bit] >> k) & 1) << bit); + b2 |= (uchar)(((b->qh[16 + bit] >> k) & 1) << bit); + b3 |= (uchar)(((b->qh[24 + bit] >> k) & 1) << bit); + } + uint packed = (uint)b0 | ((uint)b1 << 8) | ((uint)b2 << 16) | ((uint)b3 << 24); + dst_qh[i01 + (i00 * 8 + k) * ne01 + i02 * ne00_blk * 8 * ne01] = packed; + } + + uint4 qv[8]; + uchar * qv_bytes = (uchar *)qv; + for (int i = 0; i < QK_K / 64; ++i) { + for (int j = 0; j < 16; ++j) { + uchar x0 = b->qs[i*32 + 2*j]; + uchar x1 = b->qs[i*32 + 2*j + 1]; + + qv_bytes[i*32 + j ] = convert_uchar(x0 & mask_0F) | convert_uchar((x1 & mask_0F) << 4); + qv_bytes[i*32 + j + 16] = convert_uchar((x0 & mask_F0) >> 4) | convert_uchar(x1 & mask_F0); + } + } + + uint base = i02 * ne00_blk * ne01 * 32 + i00 * ne01 * 32 + i01; + #pragma unroll + for (int p = 0; p < 8; ++p) { + uint4 v = qv[p]; + dst_qs[base + (p * 4 + 0) * ne01] = v.x; + dst_qs[base + (p * 4 + 1) * ne01] = v.y; + dst_qs[base + (p * 4 + 2) * ne01] = v.z; + dst_qs[base + (p * 4 + 3) * ne01] = v.w; + } + + __global uchar * s_dst = dst_s + (i02 * ne01 + i01) * ne00_blk * K_SCALE_SIZE + i00 * K_SCALE_SIZE; + #pragma unroll + for (int i = 0; i < K_SCALE_SIZE; ++i) { + s_dst[i] = b->s[i]; + } +} + +kernel void kernel_restore_block_q5_k_trans4_ns( + __global uint * src_qs, + __global uint * src_qh, + __global half * src_d, + __global half * src_dm, + __global uchar * src_s, + __global struct block_q5_K * dst0, + uint ne00, + uint ne01, + uchar mask_0F, + uchar mask_F0 +) { + uint i00 = get_global_id(1); // block index along K + uint i01 = get_global_id(0); // row index + uint i02 = get_global_id(2); // batch index + + uint ne00_blk = ne00 / QK_K; + + uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; + uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; + + __global struct block_q5_K * b = dst0 + dst_blk_offset; + + b->d = src_d[src_blk_offset]; + b->dm = src_dm[src_blk_offset]; + + for (int j = 0; j < 32; j++) b->qh[j] = 0; + for (int k = 0; 
k < 8; k++) { + uint packed = src_qh[i01 + (i00 * 8 + k) * ne01 + i02 * ne00_blk * 8 * ne01]; + uchar b0 = (uchar)(packed & 0xFF); + uchar b1 = (uchar)((packed >> 8) & 0xFF); + uchar b2 = (uchar)((packed >> 16) & 0xFF); + uchar b3 = (uchar)((packed >> 24) & 0xFF); + for (int bit = 0; bit < 8; bit++) { + b->qh[bit] |= (uchar)(((b0 >> bit) & 1) << k); + b->qh[8 + bit] |= (uchar)(((b1 >> bit) & 1) << k); + b->qh[16 + bit] |= (uchar)(((b2 >> bit) & 1) << k); + b->qh[24 + bit] |= (uchar)(((b3 >> bit) & 1) << k); + } + } + + __global uchar * s_src = src_s + (i02 * ne01 + i01) * ne00_blk * K_SCALE_SIZE + i00 * K_SCALE_SIZE; + for (int i = 0; i < K_SCALE_SIZE; ++i) { + b->s[i] = s_src[i]; + } + + uint base = i02 * ne00_blk * ne01 * 32 + i00 * ne01 * 32 + i01; + + uint4 qv[8]; + for (int p = 0; p < 8; ++p) { + qv[p].x = src_qs[base + (p * 4 + 0) * ne01]; + qv[p].y = src_qs[base + (p * 4 + 1) * ne01]; + qv[p].z = src_qs[base + (p * 4 + 2) * ne01]; + qv[p].w = src_qs[base + (p * 4 + 3) * ne01]; + } + + uchar * qv_bytes = (uchar *)qv; + for (int i = 0; i < QK_K / 64; ++i) { + for (int j = 0; j < 16; ++j) { + uchar lo = qv_bytes[i*32 + j]; + uchar hi = qv_bytes[i*32 + j + 16]; + b->qs[i*32 + 2*j] = convert_uchar((lo & mask_0F) | ((hi & mask_0F) << 4)); + b->qs[i*32 + 2*j + 1] = convert_uchar(((lo & mask_F0) >> 4) | (hi & mask_F0)); + } + } +} + +kernel void kernel_convert_block_q6_k_trans4_ns( + __global struct block_q6_K * src0, + __global uint * dst_ql, + __global uint * dst_qh, + __global half * dst_d, + __global char * dst_s, + uint ne00, + uint ne01, + uchar mask_0F, + uchar mask_F0 +) { + uint i00 = get_global_id(1); + uint i01 = get_global_id(0); + uint i02 = get_global_id(2); + + uint ne00_blk = ne00 / QK_K; + + uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; + uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; + + __global struct block_q6_K * b = src0 + src_blk_offset; + + dst_d[dst_blk_offset] = b->d; + + uint4 qlv[8]; + uchar * 
qlv_bytes = (uchar *)qlv; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 16; ++j) { + uchar x0 = b->ql[i*64 + 2*j]; + uchar x1 = b->ql[i*64 + 2*j + 1]; + uchar x2 = b->ql[i*64 + 32 + 2*j]; + uchar x3 = b->ql[i*64 + 32 + 2*j + 1]; + qlv_bytes[i*64 + j ] = convert_uchar(x0 & mask_0F) | convert_uchar((x1 & mask_0F) << 4); + qlv_bytes[i*64 + j + 16] = convert_uchar(x2 & mask_0F) | convert_uchar((x3 & mask_0F) << 4); + qlv_bytes[i*64 + j + 32] = convert_uchar((x0 & mask_F0) >> 4) | convert_uchar(x1 & mask_F0); + qlv_bytes[i*64 + j + 48] = convert_uchar((x2 & mask_F0) >> 4) | convert_uchar(x3 & mask_F0); + } + } + + uint ql_base = i02 * ne00_blk * ne01 * 32 + i00 * ne01 * 32 + i01; + + #pragma unroll + for (int p = 0; p < 8; ++p) { + uint4 v = qlv[p]; + dst_ql[ql_base + (p * 4 + 0) * ne01] = v.x; + dst_ql[ql_base + (p * 4 + 1) * ne01] = v.y; + dst_ql[ql_base + (p * 4 + 2) * ne01] = v.z; + dst_ql[ql_base + (p * 4 + 3) * ne01] = v.w; + } + + uint qhv[16] = {0}; + + for (int n = 0; n < 2; ++n) { + for (int l = 0; l < 32; ++l) { + uchar h = b->qh[n*32 + l]; + int u = l / 16; + int bit_pos = (l % 16) * 2; + qhv[(n*4 + 0)*2 + u] |= ((uint)((h >> 0) & 0x03)) << bit_pos; + qhv[(n*4 + 1)*2 + u] |= ((uint)((h >> 2) & 0x03)) << bit_pos; + qhv[(n*4 + 2)*2 + u] |= ((uint)((h >> 4) & 0x03)) << bit_pos; + qhv[(n*4 + 3)*2 + u] |= ((uint)((h >> 6) & 0x03)) << bit_pos; + } + } + + uint qh_base = i02 * ne00_blk * ne01 * 16 + i00 * ne01 * 16 + i01; + + for (int p = 0; p < 16; ++p) { + dst_qh[qh_base + p * ne01] = qhv[p]; + } + + __global char * s_dst = dst_s + (i02 * ne01 + i01) * ne00_blk * 16 + i00 * 16; + #pragma unroll + for (int i = 0; i < 16; ++i) { + s_dst[i] = b->scales[i]; + } +} + +kernel void kernel_restore_block_q6_k_trans4_ns( + __global uint * src_ql, + __global uint * src_qh, + __global half * src_d, + __global char * src_s, + __global struct block_q6_K * dst0, + uint ne00, + uint ne01, + uchar mask_0F, + uchar mask_F0 +) { + uint i00 = get_global_id(1); // block index 
along K + uint i01 = get_global_id(0); // row index + uint i02 = get_global_id(2); // batch index + + uint ne00_blk = ne00 / QK_K; + + uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01; + uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01; + + __global struct block_q6_K * b = dst0 + dst_blk_offset; + + b->d = src_d[src_blk_offset]; + + uint ql_base = i02 * ne00_blk * ne01 * 32 + i00 * ne01 * 32 + i01; + uint4 qlv[8]; + for (int p = 0; p < 8; ++p) { + qlv[p].x = src_ql[ql_base + (p * 4 + 0) * ne01]; + qlv[p].y = src_ql[ql_base + (p * 4 + 1) * ne01]; + qlv[p].z = src_ql[ql_base + (p * 4 + 2) * ne01]; + qlv[p].w = src_ql[ql_base + (p * 4 + 3) * ne01]; + } + + uchar * qlv_bytes = (uchar *)qlv; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 16; ++j) { + uchar lo_02 = qlv_bytes[i*64 + j]; + uchar lo_13 = qlv_bytes[i*64 + j + 16]; + uchar hi_02 = qlv_bytes[i*64 + j + 32]; + uchar hi_13 = qlv_bytes[i*64 + j + 48]; + b->ql[i*64 + 2*j] = convert_uchar((lo_02 & mask_0F) | ((hi_02 & mask_0F) << 4)); + b->ql[i*64 + 2*j + 1] = convert_uchar(((lo_02 & mask_F0) >> 4) | (hi_02 & mask_F0)); + b->ql[i*64 + 32 + 2*j] = convert_uchar((lo_13 & mask_0F) | ((hi_13 & mask_0F) << 4)); + b->ql[i*64 + 32 + 2*j + 1] = convert_uchar(((lo_13 & mask_F0) >> 4) | (hi_13 & mask_F0)); + } + } + + uint qh_base = i02 * ne00_blk * ne01 * 16 + i00 * ne01 * 16 + i01; + uint qhv[16]; + for (int p = 0; p < 16; ++p) { + qhv[p] = src_qh[qh_base + p * ne01]; + } + + for (int n = 0; n < 2; ++n) { + for (int l = 0; l < 32; ++l) { + int u = l / 16; + int bit_pos = (l % 16) * 2; + uchar v0 = (uchar)((qhv[(n*4 + 0)*2 + u] >> bit_pos) & 0x03); + uchar v1 = (uchar)((qhv[(n*4 + 1)*2 + u] >> bit_pos) & 0x03); + uchar v2 = (uchar)((qhv[(n*4 + 2)*2 + u] >> bit_pos) & 0x03); + uchar v3 = (uchar)((qhv[(n*4 + 3)*2 + u] >> bit_pos) & 0x03); + b->qh[n*32 + l] = v0 | (v1 << 2) | (v2 << 4) | (v3 << 6); + } + } + + __global char * s_src = src_s + (i02 * ne01 + i01) * ne00_blk * 16 + i00 * 16; 
+ for (int i = 0; i < 16; ++i) { + b->scales[i] = s_src[i]; + } +} + //------------------------------------------------------------------------------ // block_mxfp4 //------------------------------------------------------------------------------ diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl new file mode 100644 index 000000000..9d24aff6a --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl @@ -0,0 +1,279 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable +#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable +#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable + +#define TILESIZE_K 16 +#define TILESIZE_M 64 +#define TILESIZE_N 32 +#define QK_K 256 +#define K_SCALE_SIZE 12 + +inline void get_scale_min_k4( + int j, + global const uchar * q, + uchar * d, + uchar * m +) { + if (j < 4) { + *d = q[j] & 63; + *m = q[j+4] & 63; + } else { + *d = (q[j+4] & 0x0F) | ((q[j-4] & 0xC0) >> 2); + *m = ((q[j+4] >> 4) & 0x0F) | ((q[j] & 0xC0) >> 2); + } +} + +#define dequantize_q4_k(q4, a_f16, scale, minv) \ + a_f16.s0 = (half)((float)(q4.s0 & 0x000F) * scale - minv); \ + a_f16.s1 = (half)((float)((q4.s0 & 0x00F0) >> 4) * scale - minv); \ + a_f16.s2 = (half)((float)((q4.s0 & 0x0F00) >> 8) * scale - minv); \ + a_f16.s3 = (half)((float)((q4.s0 & 0xF000) >> 12) * scale - minv); \ + a_f16.s4 = (half)((float)(q4.s1 & 0x000F) * scale - minv); \ + a_f16.s5 = (half)((float)((q4.s1 & 0x00F0) >> 4) * scale - minv); \ + a_f16.s6 = (half)((float)((q4.s1 & 0x0F00) >> 8) * scale - minv); \ + a_f16.s7 = (half)((float)((q4.s1 & 0xF000) >> 12) * scale - minv); \ + a_f16.s8 = (half)((float)(q4.s2 & 0x000F) * scale - minv); \ + a_f16.s9 = (half)((float)((q4.s2 & 0x00F0) >> 4) * scale - minv); \ + a_f16.sa = (half)((float)((q4.s2 & 0x0F00) >> 8) * scale - minv); \ + a_f16.sb = 
(half)((float)((q4.s2 & 0xF000) >> 12) * scale - minv); \ + a_f16.sc = (half)((float)(q4.s3 & 0x000F) * scale - minv); \ + a_f16.sd = (half)((float)((q4.s3 & 0x00F0) >> 4) * scale - minv); \ + a_f16.se = (half)((float)((q4.s3 & 0x0F00) >> 8) * scale - minv); \ + a_f16.sf = (half)((float)((q4.s3 & 0xF000) >> 12) * scale - minv); \ + + +#define dotx16_reduce8(a_reg, b_lm, c_reg, lm_offset) \ + acc.s0 = dot(a_reg.s0123, b_lm[lm_offset + 0]); \ + acc.s1 = dot(a_reg.s0123, b_lm[lm_offset + 1]); \ + acc.s2 = dot(a_reg.s0123, b_lm[lm_offset + 2]); \ + acc.s3 = dot(a_reg.s0123, b_lm[lm_offset + 3]); \ + acc.s4 = dot(a_reg.s0123, b_lm[lm_offset + 4]); \ + acc.s5 = dot(a_reg.s0123, b_lm[lm_offset + 5]); \ + acc.s6 = dot(a_reg.s0123, b_lm[lm_offset + 6]); \ + acc.s7 = dot(a_reg.s0123, b_lm[lm_offset + 7]); \ + acc.s8 = dot(a_reg.s0123, b_lm[lm_offset + 8]); \ + acc.s9 = dot(a_reg.s0123, b_lm[lm_offset + 9]); \ + acc.sa = dot(a_reg.s0123, b_lm[lm_offset + 10]); \ + acc.sb = dot(a_reg.s0123, b_lm[lm_offset + 11]); \ + acc.sc = dot(a_reg.s0123, b_lm[lm_offset + 12]); \ + acc.sd = dot(a_reg.s0123, b_lm[lm_offset + 13]); \ + acc.se = dot(a_reg.s0123, b_lm[lm_offset + 14]); \ + acc.sf = dot(a_reg.s0123, b_lm[lm_offset + 15]); \ + acc.s0 += dot(a_reg.s4567, b_lm[lm_offset + 32]); \ + acc.s1 += dot(a_reg.s4567, b_lm[lm_offset + 33]); \ + acc.s2 += dot(a_reg.s4567, b_lm[lm_offset + 34]); \ + acc.s3 += dot(a_reg.s4567, b_lm[lm_offset + 35]); \ + acc.s4 += dot(a_reg.s4567, b_lm[lm_offset + 36]); \ + acc.s5 += dot(a_reg.s4567, b_lm[lm_offset + 37]); \ + acc.s6 += dot(a_reg.s4567, b_lm[lm_offset + 38]); \ + acc.s7 += dot(a_reg.s4567, b_lm[lm_offset + 39]); \ + acc.s8 += dot(a_reg.s4567, b_lm[lm_offset + 40]); \ + acc.s9 += dot(a_reg.s4567, b_lm[lm_offset + 41]); \ + acc.sa += dot(a_reg.s4567, b_lm[lm_offset + 42]); \ + acc.sb += dot(a_reg.s4567, b_lm[lm_offset + 43]); \ + acc.sc += dot(a_reg.s4567, b_lm[lm_offset + 44]); \ + acc.sd += dot(a_reg.s4567, b_lm[lm_offset + 45]); \ + acc.se += 
dot(a_reg.s4567, b_lm[lm_offset + 46]); \ + acc.sf += dot(a_reg.s4567, b_lm[lm_offset + 47]); \ + c_reg.lo += convert_float8(acc.lo); \ + c_reg.hi += convert_float8(acc.hi); \ + acc.s0 = dot(a_reg.s89ab, b_lm[lm_offset + 64]); \ + acc.s1 = dot(a_reg.s89ab, b_lm[lm_offset + 65]); \ + acc.s2 = dot(a_reg.s89ab, b_lm[lm_offset + 66]); \ + acc.s3 = dot(a_reg.s89ab, b_lm[lm_offset + 67]); \ + acc.s4 = dot(a_reg.s89ab, b_lm[lm_offset + 68]); \ + acc.s5 = dot(a_reg.s89ab, b_lm[lm_offset + 69]); \ + acc.s6 = dot(a_reg.s89ab, b_lm[lm_offset + 70]); \ + acc.s7 = dot(a_reg.s89ab, b_lm[lm_offset + 71]); \ + acc.s8 = dot(a_reg.s89ab, b_lm[lm_offset + 72]); \ + acc.s9 = dot(a_reg.s89ab, b_lm[lm_offset + 73]); \ + acc.sa = dot(a_reg.s89ab, b_lm[lm_offset + 74]); \ + acc.sb = dot(a_reg.s89ab, b_lm[lm_offset + 75]); \ + acc.sc = dot(a_reg.s89ab, b_lm[lm_offset + 76]); \ + acc.sd = dot(a_reg.s89ab, b_lm[lm_offset + 77]); \ + acc.se = dot(a_reg.s89ab, b_lm[lm_offset + 78]); \ + acc.sf = dot(a_reg.s89ab, b_lm[lm_offset + 79]); \ + acc.s0 += dot(a_reg.scdef, b_lm[lm_offset + 96]); \ + acc.s1 += dot(a_reg.scdef, b_lm[lm_offset + 97]); \ + acc.s2 += dot(a_reg.scdef, b_lm[lm_offset + 98]); \ + acc.s3 += dot(a_reg.scdef, b_lm[lm_offset + 99]); \ + acc.s4 += dot(a_reg.scdef, b_lm[lm_offset + 100]); \ + acc.s5 += dot(a_reg.scdef, b_lm[lm_offset + 101]); \ + acc.s6 += dot(a_reg.scdef, b_lm[lm_offset + 102]); \ + acc.s7 += dot(a_reg.scdef, b_lm[lm_offset + 103]); \ + acc.s8 += dot(a_reg.scdef, b_lm[lm_offset + 104]); \ + acc.s9 += dot(a_reg.scdef, b_lm[lm_offset + 105]); \ + acc.sa += dot(a_reg.scdef, b_lm[lm_offset + 106]); \ + acc.sb += dot(a_reg.scdef, b_lm[lm_offset + 107]); \ + acc.sc += dot(a_reg.scdef, b_lm[lm_offset + 108]); \ + acc.sd += dot(a_reg.scdef, b_lm[lm_offset + 109]); \ + acc.se += dot(a_reg.scdef, b_lm[lm_offset + 110]); \ + acc.sf += dot(a_reg.scdef, b_lm[lm_offset + 111]); \ + c_reg.lo += convert_float8(acc.lo); \ + c_reg.hi += convert_float8(acc.hi); \ + + 
+__attribute__((qcom_wave_pair_mode(1))) +kernel void kernel_gemm_moe_q4_k_f32_ns( + __read_only image1d_buffer_t src0_q, + __global half * src0_d, + __global half * src0_dm, + __global uchar * src0_s, + __read_only image1d_buffer_t src1, + __global uint * src2, + __global ushort * src2_emap, + __write_only image1d_buffer_t dst, + __global int * total_tiles, + uint ne00, + uint ne01 +) { + uint block_id_m = get_global_id(1); // m_tile + uint block_id_n = get_global_id(2); // n_tile + + // Boundary check + if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) { + return; + } + + __private half16 reg_a; + __private float32 reg_c = (float32)(0); + __local half4 shared_b[128]; + + const ushort expert_id = src2_emap[block_id_n]; + + const uint row = block_id_m * TILESIZE_M; + const uint col = block_id_n * TILESIZE_N; + + uint sub_block_id_m = get_local_id(0); + uint2 b_global_offset; + b_global_offset.x = ((sub_block_id_m & 3) << 2) + (sub_block_id_m >> 2) * ne00; + b_global_offset.y = b_global_offset.x + (16 * ne00); + uint2 b_local_offset; + b_local_offset.x = (sub_block_id_m & 3) * 32 + (sub_block_id_m >> 2); + b_local_offset.y = b_local_offset.x + 16; + + uint num_superblocks = ne00 / QK_K; + uint scales_per_row = num_superblocks * K_SCALE_SIZE; + uint row_idx = row + get_global_id(0); + + // Loop along K axis, 32 elements per iteration (one sub-block), divided into 2 halves of 16 + for (uint step = 0; step < ne00; step += TILESIZE_K * 2) { + uint sub = step / 32; + uint sb = sub / 8; + uint j = sub % 8; + + // Load d and dm for super-block + uint d_offset = row + sb * ne01 + expert_id * num_superblocks * ne01 + get_global_id(0); + half d_val = src0_d[d_offset]; + half dm_val = src0_dm[d_offset]; + + // Load sub-block scale and min + global const uchar * sc = src0_s + (expert_id * ne01 + row_idx) * scales_per_row + sb * K_SCALE_SIZE; + uchar sv, mn; + get_scale_min_k4(j, sc, &sv, &mn); + + float scale = (float)d_val * 
(float)sv; + float minv = (float)dm_val * (float)mn; + + // First sub-block (16 elements) + uint q_sub_offset = row + ((ne01 * step) >> 3) + ((expert_id * ne00 * ne01) >> 3); + uint b_sub_offset = col * ne00 + step; + + // Load 16 q (64-bits) in transposed layout + uint2 q4x16; + q4x16.x = read_imageui(src0_q, q_sub_offset + sub_block_id_m).x; + q4x16.y = read_imageui(src0_q, q_sub_offset + sub_block_id_m + ne01).x; + + // Load 16x32 floats from matrix B + float8 bx8_f32; + bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4); + bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4); + half8 bx8_f16 = convert_half8(bx8_f32); + shared_b[b_local_offset.x] = bx8_f16.lo; + shared_b[b_local_offset.y] = bx8_f16.hi; + + // Dequantization + dequantize_q4_k(as_ushort4(q4x16), reg_a, scale, minv); + + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + + half16 acc; + dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0); + dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16); + + // Second half (next 16 elements, same sub-block scale) + uint half_step = step + TILESIZE_K; + q_sub_offset = row + ((ne01 * half_step) >> 3) + ((expert_id * ne00 * ne01) >> 3); + b_sub_offset = col * ne00 + half_step; + + q4x16.x = read_imageui(src0_q, q_sub_offset + sub_block_id_m).x; + q4x16.y = read_imageui(src0_q, q_sub_offset + sub_block_id_m + ne01).x; + + bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4); + bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4); + bx8_f16 = convert_half8(bx8_f32); + shared_b[b_local_offset.x] = bx8_f16.lo; + shared_b[b_local_offset.y] = bx8_f16.hi; + + dequantize_q4_k(as_ushort4(q4x16), reg_a, scale, minv); + + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + + dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0); + dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16); + } + + // Load post router and share in LM + __local uint out_idx[TILESIZE_N]; + + if (get_local_id(0) < TILESIZE_N) { + uint idx = src2[block_id_n * TILESIZE_N + 
get_local_id(0)]; + if (idx == 0xFFFFFFFF) { + idx = src2[block_id_n * TILESIZE_N + 0]; + } + out_idx[get_local_id(0)] = idx * ne01; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // Scatter results back to original position in output grid + uint m_offset = row + get_local_id(0); + + write_imagef(dst, out_idx[1] + m_offset, (reg_c.s1)); + write_imagef(dst, out_idx[2] + m_offset, (reg_c.s2)); + write_imagef(dst, out_idx[3] + m_offset, (reg_c.s3)); + write_imagef(dst, out_idx[4] + m_offset, (reg_c.s4)); + write_imagef(dst, out_idx[5] + m_offset, (reg_c.s5)); + write_imagef(dst, out_idx[6] + m_offset, (reg_c.s6)); + write_imagef(dst, out_idx[7] + m_offset, (reg_c.s7)); + write_imagef(dst, out_idx[8] + m_offset, (reg_c.s8)); + write_imagef(dst, out_idx[9] + m_offset, (reg_c.s9)); + write_imagef(dst, out_idx[10] + m_offset, (reg_c.sa)); + write_imagef(dst, out_idx[11] + m_offset, (reg_c.sb)); + write_imagef(dst, out_idx[12] + m_offset, (reg_c.sc)); + write_imagef(dst, out_idx[13] + m_offset, (reg_c.sd)); + write_imagef(dst, out_idx[14] + m_offset, (reg_c.se)); + write_imagef(dst, out_idx[15] + m_offset, (reg_c.sf)); + write_imagef(dst, out_idx[16] + m_offset, (reg_c.sg)); + write_imagef(dst, out_idx[17] + m_offset, (reg_c.sh)); + write_imagef(dst, out_idx[18] + m_offset, (reg_c.si)); + write_imagef(dst, out_idx[19] + m_offset, (reg_c.sj)); + write_imagef(dst, out_idx[20] + m_offset, (reg_c.sk)); + write_imagef(dst, out_idx[21] + m_offset, (reg_c.sl)); + write_imagef(dst, out_idx[22] + m_offset, (reg_c.sm)); + write_imagef(dst, out_idx[23] + m_offset, (reg_c.sn)); + write_imagef(dst, out_idx[24] + m_offset, (reg_c.so)); + write_imagef(dst, out_idx[25] + m_offset, (reg_c.sp)); + write_imagef(dst, out_idx[26] + m_offset, (reg_c.sq)); + write_imagef(dst, out_idx[27] + m_offset, (reg_c.sr)); + write_imagef(dst, out_idx[28] + m_offset, (reg_c.ss)); + write_imagef(dst, out_idx[29] + m_offset, (reg_c.st)); + write_imagef(dst, out_idx[30] + m_offset, (reg_c.su)); + write_imagef(dst, 
out_idx[31] + m_offset, (reg_c.sv)); + + // Store zero padding parts to the index of first output in tile + barrier(CLK_GLOBAL_MEM_FENCE); + write_imagef(dst, out_idx[0] + m_offset, (reg_c.s0)); +} diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl new file mode 100644 index 000000000..808a0c7db --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl @@ -0,0 +1,284 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable +#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable +#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable + +#define TILESIZE_K 16 +#define TILESIZE_M 64 +#define TILESIZE_N 32 +#define QK_K 256 +#define K_SCALE_SIZE 12 + +inline void get_scale_min_k4( + int j, + global const uchar * q, + uchar * d, + uchar * m +) { + if (j < 4) { + *d = q[j] & 63; + *m = q[j+4] & 63; + } else { + *d = (q[j+4] & 0x0F) | ((q[j-4] & 0xC0) >> 2); + *m = ((q[j+4] >> 4) & 0x0F) | ((q[j] & 0xC0) >> 2); + } +} + +#define dequantize_q5_k(qs5x16, qh5x16, a_f16, scale, m) \ + a_f16.s0 = (half)((float)(( qs5x16.s0 & 0x000F) | (( qh5x16.s0 & 0x01) << 4)) * scale + m); \ + a_f16.s1 = (half)((float)((((qs5x16.s0 & 0x00F0) >> 4 ) | (((qh5x16.s0 >> 1) & 0x01) << 4)) * scale + m)); \ + a_f16.s2 = (half)((float)((((qs5x16.s0 & 0x0F00) >> 8 ) | (((qh5x16.s0 >> 2) & 0x01) << 4)) * scale + m)); \ + a_f16.s3 = (half)((float)((((qs5x16.s0 & 0xF000) >> 12) | (((qh5x16.s0 >> 3) & 0x01) << 4)) * scale + m)); \ + a_f16.s4 = (half)((float)((( qs5x16.s1 & 0x000F) | (((qh5x16.s0 >> 4) & 0x01) << 4)) * scale + m)); \ + a_f16.s5 = (half)((float)((((qs5x16.s1 & 0x00F0) >> 4 ) | (((qh5x16.s0 >> 5) & 0x01) << 4)) * scale + m)); \ + a_f16.s6 = (half)((float)(((qs5x16.s1 & 0x0F00) >> 8 ) | (((qh5x16.s0 >> 6) & 0x01) << 4)) * scale + m); \ + a_f16.s7 = (half)((float)((((qs5x16.s1 & 0xF000) 
>> 12) | (((qh5x16.s0 >> 7) & 0x01) << 4)) * scale + m)); \ + a_f16.s8 = (half)((float)((( qs5x16.s2 & 0x000F) | (( qh5x16.s1 & 0x01) << 4)) * scale + m)); \ + a_f16.s9 = (half)((float)((((qs5x16.s2 & 0x00F0) >> 4 ) | (((qh5x16.s1 >> 1) & 0x01) << 4)) * scale + m)); \ + a_f16.sa = (half)((float)((((qs5x16.s2 & 0x0F00) >> 8 ) | (((qh5x16.s1 >> 2) & 0x01) << 4)) * scale + m)); \ + a_f16.sb = (half)((float)((((qs5x16.s2 & 0xF000) >> 12) | (((qh5x16.s1 >> 3) & 0x01) << 4)) * scale + m)); \ + a_f16.sc = (half)((float)((( qs5x16.s3 & 0x000F) | (((qh5x16.s1 >> 4) & 0x01) << 4)) * scale + m)); \ + a_f16.sd = (half)((float)((((qs5x16.s3 & 0x00F0) >> 4 ) | (((qh5x16.s1 >> 5) & 0x01) << 4)) * scale + m)); \ + a_f16.se = (half)((float)((((qs5x16.s3 & 0x0F00) >> 8 ) | (((qh5x16.s1 >> 6) & 0x01) << 4)) * scale + m)); \ + a_f16.sf = (half)((float)((((qs5x16.s3 & 0xF000) >> 12) | (((qh5x16.s1 >> 7) & 0x01) << 4)) * scale + m)); \ + + +#define dotx16_reduce8(a_reg, b_lm, c_reg, lm_offset) \ + acc.s0 = dot(a_reg.s0123, b_lm[lm_offset + 0]); \ + acc.s1 = dot(a_reg.s0123, b_lm[lm_offset + 1]); \ + acc.s2 = dot(a_reg.s0123, b_lm[lm_offset + 2]); \ + acc.s3 = dot(a_reg.s0123, b_lm[lm_offset + 3]); \ + acc.s4 = dot(a_reg.s0123, b_lm[lm_offset + 4]); \ + acc.s5 = dot(a_reg.s0123, b_lm[lm_offset + 5]); \ + acc.s6 = dot(a_reg.s0123, b_lm[lm_offset + 6]); \ + acc.s7 = dot(a_reg.s0123, b_lm[lm_offset + 7]); \ + acc.s8 = dot(a_reg.s0123, b_lm[lm_offset + 8]); \ + acc.s9 = dot(a_reg.s0123, b_lm[lm_offset + 9]); \ + acc.sa = dot(a_reg.s0123, b_lm[lm_offset + 10]); \ + acc.sb = dot(a_reg.s0123, b_lm[lm_offset + 11]); \ + acc.sc = dot(a_reg.s0123, b_lm[lm_offset + 12]); \ + acc.sd = dot(a_reg.s0123, b_lm[lm_offset + 13]); \ + acc.se = dot(a_reg.s0123, b_lm[lm_offset + 14]); \ + acc.sf = dot(a_reg.s0123, b_lm[lm_offset + 15]); \ + acc.s0 += dot(a_reg.s4567, b_lm[lm_offset + 32]); \ + acc.s1 += dot(a_reg.s4567, b_lm[lm_offset + 33]); \ + acc.s2 += dot(a_reg.s4567, b_lm[lm_offset + 34]); \ + acc.s3 
+= dot(a_reg.s4567, b_lm[lm_offset + 35]); \ + acc.s4 += dot(a_reg.s4567, b_lm[lm_offset + 36]); \ + acc.s5 += dot(a_reg.s4567, b_lm[lm_offset + 37]); \ + acc.s6 += dot(a_reg.s4567, b_lm[lm_offset + 38]); \ + acc.s7 += dot(a_reg.s4567, b_lm[lm_offset + 39]); \ + acc.s8 += dot(a_reg.s4567, b_lm[lm_offset + 40]); \ + acc.s9 += dot(a_reg.s4567, b_lm[lm_offset + 41]); \ + acc.sa += dot(a_reg.s4567, b_lm[lm_offset + 42]); \ + acc.sb += dot(a_reg.s4567, b_lm[lm_offset + 43]); \ + acc.sc += dot(a_reg.s4567, b_lm[lm_offset + 44]); \ + acc.sd += dot(a_reg.s4567, b_lm[lm_offset + 45]); \ + acc.se += dot(a_reg.s4567, b_lm[lm_offset + 46]); \ + acc.sf += dot(a_reg.s4567, b_lm[lm_offset + 47]); \ + c_reg.lo += convert_float8(acc.lo); \ + c_reg.hi += convert_float8(acc.hi); \ + acc.s0 = dot(a_reg.s89ab, b_lm[lm_offset + 64]); \ + acc.s1 = dot(a_reg.s89ab, b_lm[lm_offset + 65]); \ + acc.s2 = dot(a_reg.s89ab, b_lm[lm_offset + 66]); \ + acc.s3 = dot(a_reg.s89ab, b_lm[lm_offset + 67]); \ + acc.s4 = dot(a_reg.s89ab, b_lm[lm_offset + 68]); \ + acc.s5 = dot(a_reg.s89ab, b_lm[lm_offset + 69]); \ + acc.s6 = dot(a_reg.s89ab, b_lm[lm_offset + 70]); \ + acc.s7 = dot(a_reg.s89ab, b_lm[lm_offset + 71]); \ + acc.s8 = dot(a_reg.s89ab, b_lm[lm_offset + 72]); \ + acc.s9 = dot(a_reg.s89ab, b_lm[lm_offset + 73]); \ + acc.sa = dot(a_reg.s89ab, b_lm[lm_offset + 74]); \ + acc.sb = dot(a_reg.s89ab, b_lm[lm_offset + 75]); \ + acc.sc = dot(a_reg.s89ab, b_lm[lm_offset + 76]); \ + acc.sd = dot(a_reg.s89ab, b_lm[lm_offset + 77]); \ + acc.se = dot(a_reg.s89ab, b_lm[lm_offset + 78]); \ + acc.sf = dot(a_reg.s89ab, b_lm[lm_offset + 79]); \ + acc.s0 += dot(a_reg.scdef, b_lm[lm_offset + 96]); \ + acc.s1 += dot(a_reg.scdef, b_lm[lm_offset + 97]); \ + acc.s2 += dot(a_reg.scdef, b_lm[lm_offset + 98]); \ + acc.s3 += dot(a_reg.scdef, b_lm[lm_offset + 99]); \ + acc.s4 += dot(a_reg.scdef, b_lm[lm_offset + 100]); \ + acc.s5 += dot(a_reg.scdef, b_lm[lm_offset + 101]); \ + acc.s6 += dot(a_reg.scdef, b_lm[lm_offset + 102]); 
\ + acc.s7 += dot(a_reg.scdef, b_lm[lm_offset + 103]); \ + acc.s8 += dot(a_reg.scdef, b_lm[lm_offset + 104]); \ + acc.s9 += dot(a_reg.scdef, b_lm[lm_offset + 105]); \ + acc.sa += dot(a_reg.scdef, b_lm[lm_offset + 106]); \ + acc.sb += dot(a_reg.scdef, b_lm[lm_offset + 107]); \ + acc.sc += dot(a_reg.scdef, b_lm[lm_offset + 108]); \ + acc.sd += dot(a_reg.scdef, b_lm[lm_offset + 109]); \ + acc.se += dot(a_reg.scdef, b_lm[lm_offset + 110]); \ + acc.sf += dot(a_reg.scdef, b_lm[lm_offset + 111]); \ + c_reg.lo += convert_float8(acc.lo); \ + c_reg.hi += convert_float8(acc.hi); \ + + +__attribute__((qcom_wave_pair_mode(1))) +kernel void kernel_gemm_moe_q5_k_f32_ns( + __read_only image1d_buffer_t src0_q, + __global uint * src0_qh, + __global uchar * src0_s, + __global half * src0_d, + __global half * src0_dm, + __read_only image1d_buffer_t src1, + __global uint * src2, + __global ushort * src2_emap, + __write_only image1d_buffer_t dst, + __global int * total_tiles, + uint ne00, + uint ne01 +) { + uint block_id_m = get_global_id(1); // m_tile + uint block_id_n = get_global_id(2); // n_tile + + // Boundary check + if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) { + return; + } + + __private half16 reg_a; + __private float32 reg_c = (float32)(0); + __local half4 shared_b[128]; + + const ushort expert_id = src2_emap[block_id_n]; + + const uint row = block_id_m * TILESIZE_M; + const uint col = block_id_n * TILESIZE_N; + + uint sub_block_id_m = get_local_id(0); + uint2 b_global_offset; + b_global_offset.x = ((sub_block_id_m & 3) << 2) + (sub_block_id_m >> 2) * ne00; + b_global_offset.y = b_global_offset.x + (16 * ne00); + uint2 b_local_offset; + b_local_offset.x = (sub_block_id_m & 3) * 32 + (sub_block_id_m >> 2); + b_local_offset.y = b_local_offset.x + 16; + + uint num_superblocks = ne00 / QK_K; + uint scales_per_row = num_superblocks * K_SCALE_SIZE; + uint row_idx = row + get_global_id(0); + + // Loop along K axis, 32 elements per 
iteration (one sub-block), divided into 2 halves of 16 + for (uint step = 0; step < ne00; step += TILESIZE_K * 2) { + uint sub = step / 32; + uint sb = sub / 8; + uint j = sub % 8; + + // Load d and dm for super-block + uint d_offset = row + sb * ne01 + expert_id * num_superblocks * ne01 + get_global_id(0); + half d_val = src0_d[d_offset]; + half dm_val = src0_dm[d_offset]; + + // Load sub-block scale and min + global const uchar * sc = src0_s + (expert_id * ne01 + row_idx) * scales_per_row + sb * K_SCALE_SIZE; + uchar sv, mn; + get_scale_min_k4(j, sc, &sv, &mn); + + float scale = (float)d_val * (float)sv; + float minv = -(float)dm_val * (float)mn; + + // qh is stored at sub-block granularity + uint qh_offset = row + sub * ne01 + expert_id * num_superblocks * 8 * ne01 + get_global_id(0); + uchar4 qhx32 = as_uchar4(src0_qh[qh_offset]); + + // First sub-block (16 elements) + uint q_sub_offset = row + ((ne01 * step) >> 3) + ((expert_id * ne00 * ne01) >> 3); + uint b_sub_offset = col * ne00 + step; + + // Load 16 q (64-bits) in transposed layout + uint2 q4x16; + q4x16.x = read_imageui(src0_q, q_sub_offset + sub_block_id_m).x; + q4x16.y = read_imageui(src0_q, q_sub_offset + sub_block_id_m + ne01).x; + + // Load 16x32 floats from matrix B + float8 bx8_f32; + bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4); + bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4); + half8 bx8_f16 = convert_half8(bx8_f32); + shared_b[b_local_offset.x] = bx8_f16.lo; + shared_b[b_local_offset.y] = bx8_f16.hi; + + // Dequantization + dequantize_q5_k(as_ushort4(q4x16), qhx32.lo, reg_a, scale, minv); + + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + + half16 acc; + dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0); + dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16); + + // Second half + uint half_step = step + TILESIZE_K; + q_sub_offset = row + ((ne01 * half_step) >> 3) + ((expert_id * ne00 * ne01) >> 3); + b_sub_offset = col * ne00 + half_step; + + q4x16.x = 
read_imageui(src0_q, q_sub_offset + sub_block_id_m).x; + q4x16.y = read_imageui(src0_q, q_sub_offset + sub_block_id_m + ne01).x; + + bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4); + bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4); + bx8_f16 = convert_half8(bx8_f32); + shared_b[b_local_offset.x] = bx8_f16.lo; + shared_b[b_local_offset.y] = bx8_f16.hi; + + dequantize_q5_k(as_ushort4(q4x16), qhx32.hi, reg_a, scale, minv); + + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + + dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0); + dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16); + } + + // Load post router and share in LM + __local uint out_idx[TILESIZE_N]; + + if (get_local_id(0) < TILESIZE_N) { + uint idx = src2[block_id_n * TILESIZE_N + get_local_id(0)]; + if (idx == 0xFFFFFFFF) { + idx = src2[block_id_n * TILESIZE_N + 0]; + } + out_idx[get_local_id(0)] = idx * ne01; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // Scatter results back to original position in output grid + uint m_offset = row + get_local_id(0); + + write_imagef(dst, out_idx[1] + m_offset, (reg_c.s1)); + write_imagef(dst, out_idx[2] + m_offset, (reg_c.s2)); + write_imagef(dst, out_idx[3] + m_offset, (reg_c.s3)); + write_imagef(dst, out_idx[4] + m_offset, (reg_c.s4)); + write_imagef(dst, out_idx[5] + m_offset, (reg_c.s5)); + write_imagef(dst, out_idx[6] + m_offset, (reg_c.s6)); + write_imagef(dst, out_idx[7] + m_offset, (reg_c.s7)); + write_imagef(dst, out_idx[8] + m_offset, (reg_c.s8)); + write_imagef(dst, out_idx[9] + m_offset, (reg_c.s9)); + write_imagef(dst, out_idx[10] + m_offset, (reg_c.sa)); + write_imagef(dst, out_idx[11] + m_offset, (reg_c.sb)); + write_imagef(dst, out_idx[12] + m_offset, (reg_c.sc)); + write_imagef(dst, out_idx[13] + m_offset, (reg_c.sd)); + write_imagef(dst, out_idx[14] + m_offset, (reg_c.se)); + write_imagef(dst, out_idx[15] + m_offset, (reg_c.sf)); + write_imagef(dst, out_idx[16] + m_offset, (reg_c.sg)); + write_imagef(dst, out_idx[17] + 
m_offset, (reg_c.sh)); + write_imagef(dst, out_idx[18] + m_offset, (reg_c.si)); + write_imagef(dst, out_idx[19] + m_offset, (reg_c.sj)); + write_imagef(dst, out_idx[20] + m_offset, (reg_c.sk)); + write_imagef(dst, out_idx[21] + m_offset, (reg_c.sl)); + write_imagef(dst, out_idx[22] + m_offset, (reg_c.sm)); + write_imagef(dst, out_idx[23] + m_offset, (reg_c.sn)); + write_imagef(dst, out_idx[24] + m_offset, (reg_c.so)); + write_imagef(dst, out_idx[25] + m_offset, (reg_c.sp)); + write_imagef(dst, out_idx[26] + m_offset, (reg_c.sq)); + write_imagef(dst, out_idx[27] + m_offset, (reg_c.sr)); + write_imagef(dst, out_idx[28] + m_offset, (reg_c.ss)); + write_imagef(dst, out_idx[29] + m_offset, (reg_c.st)); + write_imagef(dst, out_idx[30] + m_offset, (reg_c.su)); + write_imagef(dst, out_idx[31] + m_offset, (reg_c.sv)); + + // Store zero padding parts to the index of first output in tile + barrier(CLK_GLOBAL_MEM_FENCE); + write_imagef(dst, out_idx[0] + m_offset, (reg_c.s0)); +} diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl new file mode 100644 index 000000000..a040335ad --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl @@ -0,0 +1,263 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable +#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable +#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable + +#define TILESIZE_K 16 +#define TILESIZE_M 64 +#define TILESIZE_N 32 +#define QK_K 256 + +#define dequantize_q6_k(qs16, qh16, a_f16, scale) \ + a_f16.s0 = (half)(((float)(( qs16.s0 & 0x000F) | ((uint)(( qh16 ) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.s1 = (half)(((float)((( qs16.s0 >> 4) & 0x000F) | ((uint)(( qh16 >> 2) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.s2 = (half)(((float)((( qs16.s0 >> 8) & 0x000F) | ((uint)(( qh16 >> 4) & 0x3) << 4)) - 32.f) * 
scale); \ + a_f16.s3 = (half)(((float)((( qs16.s0 >>12) & 0x000F) | ((uint)(( qh16 >> 6) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.s4 = (half)(((float)(( qs16.s1 & 0x000F) | ((uint)(( qh16 >> 8) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.s5 = (half)(((float)((( qs16.s1 >> 4) & 0x000F) | ((uint)(( qh16 >> 10) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.s6 = (half)(((float)((( qs16.s1 >> 8) & 0x000F) | ((uint)(( qh16 >> 12) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.s7 = (half)(((float)((( qs16.s1 >>12) & 0x000F) | ((uint)(( qh16 >> 14) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.s8 = (half)(((float)(( qs16.s2 & 0x000F) | ((uint)(( qh16 >> 16) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.s9 = (half)(((float)((( qs16.s2 >> 4) & 0x000F) | ((uint)(( qh16 >> 18) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.sa = (half)(((float)((( qs16.s2 >> 8) & 0x000F) | ((uint)(( qh16 >> 20) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.sb = (half)(((float)((( qs16.s2 >>12) & 0x000F) | ((uint)(( qh16 >> 22) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.sc = (half)(((float)(( qs16.s3 & 0x000F) | ((uint)(( qh16 >> 24) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.sd = (half)(((float)((( qs16.s3 >> 4) & 0x000F) | ((uint)(( qh16 >> 26) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.se = (half)(((float)((( qs16.s3 >> 8) & 0x000F) | ((uint)(( qh16 >> 28) & 0x3) << 4)) - 32.f) * scale); \ + a_f16.sf = (half)(((float)((( qs16.s3 >>12) & 0x000F) | ((uint)(( qh16 >> 30) & 0x3) << 4)) - 32.f) * scale); \ + + +#define dotx16_reduce8(a_reg, b_lm, c_reg, lm_offset) \ + acc.s0 = dot(a_reg.s0123, b_lm[lm_offset + 0]); \ + acc.s1 = dot(a_reg.s0123, b_lm[lm_offset + 1]); \ + acc.s2 = dot(a_reg.s0123, b_lm[lm_offset + 2]); \ + acc.s3 = dot(a_reg.s0123, b_lm[lm_offset + 3]); \ + acc.s4 = dot(a_reg.s0123, b_lm[lm_offset + 4]); \ + acc.s5 = dot(a_reg.s0123, b_lm[lm_offset + 5]); \ + acc.s6 = dot(a_reg.s0123, b_lm[lm_offset + 6]); \ + acc.s7 = dot(a_reg.s0123, b_lm[lm_offset + 7]); \ + acc.s8 = dot(a_reg.s0123, b_lm[lm_offset + 8]); \ + 
acc.s9 = dot(a_reg.s0123, b_lm[lm_offset + 9]); \ + acc.sa = dot(a_reg.s0123, b_lm[lm_offset + 10]); \ + acc.sb = dot(a_reg.s0123, b_lm[lm_offset + 11]); \ + acc.sc = dot(a_reg.s0123, b_lm[lm_offset + 12]); \ + acc.sd = dot(a_reg.s0123, b_lm[lm_offset + 13]); \ + acc.se = dot(a_reg.s0123, b_lm[lm_offset + 14]); \ + acc.sf = dot(a_reg.s0123, b_lm[lm_offset + 15]); \ + acc.s0 += dot(a_reg.s4567, b_lm[lm_offset + 32]); \ + acc.s1 += dot(a_reg.s4567, b_lm[lm_offset + 33]); \ + acc.s2 += dot(a_reg.s4567, b_lm[lm_offset + 34]); \ + acc.s3 += dot(a_reg.s4567, b_lm[lm_offset + 35]); \ + acc.s4 += dot(a_reg.s4567, b_lm[lm_offset + 36]); \ + acc.s5 += dot(a_reg.s4567, b_lm[lm_offset + 37]); \ + acc.s6 += dot(a_reg.s4567, b_lm[lm_offset + 38]); \ + acc.s7 += dot(a_reg.s4567, b_lm[lm_offset + 39]); \ + acc.s8 += dot(a_reg.s4567, b_lm[lm_offset + 40]); \ + acc.s9 += dot(a_reg.s4567, b_lm[lm_offset + 41]); \ + acc.sa += dot(a_reg.s4567, b_lm[lm_offset + 42]); \ + acc.sb += dot(a_reg.s4567, b_lm[lm_offset + 43]); \ + acc.sc += dot(a_reg.s4567, b_lm[lm_offset + 44]); \ + acc.sd += dot(a_reg.s4567, b_lm[lm_offset + 45]); \ + acc.se += dot(a_reg.s4567, b_lm[lm_offset + 46]); \ + acc.sf += dot(a_reg.s4567, b_lm[lm_offset + 47]); \ + c_reg.lo += convert_float8(acc.lo); \ + c_reg.hi += convert_float8(acc.hi); \ + acc.s0 = dot(a_reg.s89ab, b_lm[lm_offset + 64]); \ + acc.s1 = dot(a_reg.s89ab, b_lm[lm_offset + 65]); \ + acc.s2 = dot(a_reg.s89ab, b_lm[lm_offset + 66]); \ + acc.s3 = dot(a_reg.s89ab, b_lm[lm_offset + 67]); \ + acc.s4 = dot(a_reg.s89ab, b_lm[lm_offset + 68]); \ + acc.s5 = dot(a_reg.s89ab, b_lm[lm_offset + 69]); \ + acc.s6 = dot(a_reg.s89ab, b_lm[lm_offset + 70]); \ + acc.s7 = dot(a_reg.s89ab, b_lm[lm_offset + 71]); \ + acc.s8 = dot(a_reg.s89ab, b_lm[lm_offset + 72]); \ + acc.s9 = dot(a_reg.s89ab, b_lm[lm_offset + 73]); \ + acc.sa = dot(a_reg.s89ab, b_lm[lm_offset + 74]); \ + acc.sb = dot(a_reg.s89ab, b_lm[lm_offset + 75]); \ + acc.sc = dot(a_reg.s89ab, b_lm[lm_offset + 76]); 
\ + acc.sd = dot(a_reg.s89ab, b_lm[lm_offset + 77]); \ + acc.se = dot(a_reg.s89ab, b_lm[lm_offset + 78]); \ + acc.sf = dot(a_reg.s89ab, b_lm[lm_offset + 79]); \ + acc.s0 += dot(a_reg.scdef, b_lm[lm_offset + 96]); \ + acc.s1 += dot(a_reg.scdef, b_lm[lm_offset + 97]); \ + acc.s2 += dot(a_reg.scdef, b_lm[lm_offset + 98]); \ + acc.s3 += dot(a_reg.scdef, b_lm[lm_offset + 99]); \ + acc.s4 += dot(a_reg.scdef, b_lm[lm_offset + 100]); \ + acc.s5 += dot(a_reg.scdef, b_lm[lm_offset + 101]); \ + acc.s6 += dot(a_reg.scdef, b_lm[lm_offset + 102]); \ + acc.s7 += dot(a_reg.scdef, b_lm[lm_offset + 103]); \ + acc.s8 += dot(a_reg.scdef, b_lm[lm_offset + 104]); \ + acc.s9 += dot(a_reg.scdef, b_lm[lm_offset + 105]); \ + acc.sa += dot(a_reg.scdef, b_lm[lm_offset + 106]); \ + acc.sb += dot(a_reg.scdef, b_lm[lm_offset + 107]); \ + acc.sc += dot(a_reg.scdef, b_lm[lm_offset + 108]); \ + acc.sd += dot(a_reg.scdef, b_lm[lm_offset + 109]); \ + acc.se += dot(a_reg.scdef, b_lm[lm_offset + 110]); \ + acc.sf += dot(a_reg.scdef, b_lm[lm_offset + 111]); \ + c_reg.lo += convert_float8(acc.lo); \ + c_reg.hi += convert_float8(acc.hi); \ + + +__attribute__((qcom_wave_pair_mode(1))) +kernel void kernel_gemm_moe_q6_k_f32_ns( + __read_only image1d_buffer_t src0_ql, + __global uint * src0_qh, + __global char * src0_s, + __global half * src0_d, + __read_only image1d_buffer_t src1, + __global uint * src2, + __global ushort * src2_emap, + __write_only image1d_buffer_t dst, + __global int * total_tiles, + uint ne00, + uint ne01 +) { + uint block_id_m = get_global_id(1); // m_tile + uint block_id_n = get_global_id(2); // n_tile + + // Boundary check + if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) { + return; + } + + __private half16 reg_a; + __private float32 reg_c = (float32)(0); + __local half4 shared_b[128]; + + const ushort expert_id = src2_emap[block_id_n]; + + const uint row = block_id_m * TILESIZE_M; + const uint col = block_id_n * TILESIZE_N; + + uint 
sub_block_id_m = get_local_id(0); + uint2 b_global_offset; + b_global_offset.x = ((sub_block_id_m & 3) << 2) + (sub_block_id_m >> 2) * ne00; + b_global_offset.y = b_global_offset.x + (16 * ne00); + uint2 b_local_offset; + b_local_offset.x = (sub_block_id_m & 3) * 32 + (sub_block_id_m >> 2); + b_local_offset.y = b_local_offset.x + 16; + + uint num_superblocks = ne00 / QK_K; + uint scales_per_row = num_superblocks * 16; + uint row_idx = row + get_global_id(0); + + // Loop along K axis, 32 elements per iteration (one sub-block), divided into 2 halves of 16 + for (uint step = 0; step < ne00; step += TILESIZE_K * 2) { + uint sub = step / 32; // 32-element group index + uint sb = sub / 8; // super-block index + uint j = sub % 8; // group within super-block + + // Load d for super-block + uint d_offset = row + sb * ne01 + expert_id * num_superblocks * ne01 + get_global_id(0); + half d_val = src0_d[d_offset]; + + // Load sub-block scales + global const char * sc = src0_s + (expert_id * ne01 + row_idx) * scales_per_row + sb * 16; + float scale0 = (float)d_val * (float)sc[j * 2]; + float scale1 = (float)d_val * (float)sc[j * 2 + 1]; + + uint qh_base = row + (sub * 2) * ne01 + expert_id * (num_superblocks * 16) * ne01 + get_global_id(0); + uint qh_first16 = src0_qh[qh_base]; + uint qh_second16 = src0_qh[qh_base + ne01]; + + // First half (16 elements) + uint q_sub_offset = row + ((ne01 * step) >> 3) + ((expert_id * ne00 * ne01) >> 3); + uint b_sub_offset = col * ne00 + step; + + // Load 16 ql nibbles (2 uints) from image + uint2 q4x16; + q4x16.x = read_imageui(src0_ql, q_sub_offset + sub_block_id_m).x; + q4x16.y = read_imageui(src0_ql, q_sub_offset + sub_block_id_m + ne01).x; + + // Load 16x32 floats from matrix B + float8 bx8_f32; + bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4); + bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4); + half8 bx8_f16 = convert_half8(bx8_f32); + shared_b[b_local_offset.x] = bx8_f16.lo; + 
shared_b[b_local_offset.y] = bx8_f16.hi; + + // Dequantize first 16 elements (scale0) + dequantize_q6_k(as_ushort4(q4x16), qh_first16, reg_a, scale0); + + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + + half16 acc; + dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0); + dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16); + + // Second half + uint half_step = step + TILESIZE_K; + q_sub_offset = row + ((ne01 * half_step) >> 3) + ((expert_id * ne00 * ne01) >> 3); + b_sub_offset = col * ne00 + half_step; + + q4x16.x = read_imageui(src0_ql, q_sub_offset + sub_block_id_m).x; + q4x16.y = read_imageui(src0_ql, q_sub_offset + sub_block_id_m + ne01).x; + + bx8_f32.lo = read_imagef(src1, (b_sub_offset + b_global_offset.x) / 4); + bx8_f32.hi = read_imagef(src1, (b_sub_offset + b_global_offset.y) / 4); + bx8_f16 = convert_half8(bx8_f32); + shared_b[b_local_offset.x] = bx8_f16.lo; + shared_b[b_local_offset.y] = bx8_f16.hi; + + dequantize_q6_k(as_ushort4(q4x16), qh_second16, reg_a, scale1); + + sub_group_barrier(CLK_LOCAL_MEM_FENCE); + + dotx16_reduce8(reg_a, shared_b, reg_c.lo, 0); + dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16); + } + + // Load post router and share in LM + __local uint out_idx[TILESIZE_N]; + + if (get_local_id(0) < TILESIZE_N) { + uint idx = src2[block_id_n * TILESIZE_N + get_local_id(0)]; + if (idx == 0xFFFFFFFF) { + idx = src2[block_id_n * TILESIZE_N + 0]; + } + out_idx[get_local_id(0)] = idx * ne01; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // Scatter results back to original position in output grid + uint m_offset = row + get_local_id(0); + + write_imagef(dst, out_idx[1] + m_offset, (reg_c.s1)); + write_imagef(dst, out_idx[2] + m_offset, (reg_c.s2)); + write_imagef(dst, out_idx[3] + m_offset, (reg_c.s3)); + write_imagef(dst, out_idx[4] + m_offset, (reg_c.s4)); + write_imagef(dst, out_idx[5] + m_offset, (reg_c.s5)); + write_imagef(dst, out_idx[6] + m_offset, (reg_c.s6)); + write_imagef(dst, out_idx[7] + m_offset, (reg_c.s7)); + write_imagef(dst, out_idx[8] + m_offset, 
(reg_c.s8)); + write_imagef(dst, out_idx[9] + m_offset, (reg_c.s9)); + write_imagef(dst, out_idx[10] + m_offset, (reg_c.sa)); + write_imagef(dst, out_idx[11] + m_offset, (reg_c.sb)); + write_imagef(dst, out_idx[12] + m_offset, (reg_c.sc)); + write_imagef(dst, out_idx[13] + m_offset, (reg_c.sd)); + write_imagef(dst, out_idx[14] + m_offset, (reg_c.se)); + write_imagef(dst, out_idx[15] + m_offset, (reg_c.sf)); + write_imagef(dst, out_idx[16] + m_offset, (reg_c.sg)); + write_imagef(dst, out_idx[17] + m_offset, (reg_c.sh)); + write_imagef(dst, out_idx[18] + m_offset, (reg_c.si)); + write_imagef(dst, out_idx[19] + m_offset, (reg_c.sj)); + write_imagef(dst, out_idx[20] + m_offset, (reg_c.sk)); + write_imagef(dst, out_idx[21] + m_offset, (reg_c.sl)); + write_imagef(dst, out_idx[22] + m_offset, (reg_c.sm)); + write_imagef(dst, out_idx[23] + m_offset, (reg_c.sn)); + write_imagef(dst, out_idx[24] + m_offset, (reg_c.so)); + write_imagef(dst, out_idx[25] + m_offset, (reg_c.sp)); + write_imagef(dst, out_idx[26] + m_offset, (reg_c.sq)); + write_imagef(dst, out_idx[27] + m_offset, (reg_c.sr)); + write_imagef(dst, out_idx[28] + m_offset, (reg_c.ss)); + write_imagef(dst, out_idx[29] + m_offset, (reg_c.st)); + write_imagef(dst, out_idx[30] + m_offset, (reg_c.su)); + write_imagef(dst, out_idx[31] + m_offset, (reg_c.sv)); + + // Store zero padding parts to the index of first output in tile + barrier(CLK_GLOBAL_MEM_FENCE); + write_imagef(dst, out_idx[0] + m_offset, (reg_c.s0)); +} diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl new file mode 100644 index 000000000..13d79f252 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl @@ -0,0 +1,151 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable + +#define QK_K 256 +#define K_SCALE_SIZE 12 +#define N_SIMDGROUP 4 +#define SIMDGROUP_WIDTH 
64 + +inline void get_scale_min_k4( + int j, + global const uchar * q, + uchar * d, + uchar * m +) { + if (j < 4) { + *d = q[j] & 63; + *m = q[j+4] & 63; + } else { + *d = (q[j+4] & 0x0F) | ((q[j-4] & 0xC0) >> 2); + *m = ((q[j+4] >> 4) & 0x0F) | ((q[j] & 0xC0) >> 2); + } +} + +static inline float8 q4_k_to_fp32_packed8(ushort2 q4x8, float scale, float minv) { + float8 fp32x8; + fp32x8.s0 = (q4x8.s0 & 0x000F) * scale - minv; + fp32x8.s1 = ((q4x8.s0 & 0x00F0) >> 4) * scale - minv; + fp32x8.s2 = ((q4x8.s0 & 0x0F00) >> 8) * scale - minv; + fp32x8.s3 = ((q4x8.s0 & 0xF000) >> 12) * scale - minv; + fp32x8.s4 = (q4x8.s1 & 0x000F) * scale - minv; + fp32x8.s5 = ((q4x8.s1 & 0x00F0) >> 4) * scale - minv; + fp32x8.s6 = ((q4x8.s1 & 0x0F00) >> 8) * scale - minv; + fp32x8.s7 = ((q4x8.s1 & 0xF000) >> 12) * scale - minv; + return fp32x8; +} + +__attribute__((qcom_reqd_sub_group_size("half"))) +__kernel void kernel_gemv_moe_q4_k_f32_ns( + __global uint * src0_q, + __global half * src0_d, + __global half * src0_dm, + __global uchar * src0_s, + __read_only image1d_buffer_t src1, + __global uint * src2, + __global float * dst, + ulong offsetd, + int ne00, + int ne01, + int ne11 +) { + uint i01 = get_global_id(0); + uint i20 = get_global_id(2); + uint sgid = get_local_id(1); + uint slid = get_sub_group_local_id(); + + uint i11 = i20 % ne11; + + uint expert_id = src2[i20]; + + int num_superblocks = ne00 / QK_K; + int num_subblocks = ne00 / 32; + int scales_per_row = num_superblocks * K_SCALE_SIZE; + + // Expert offsets in the transposed noshuffle layout + uint expert_q_offset = expert_id * (ne00 / 8) * ne01; + uint expert_d_offset = expert_id * num_superblocks * ne01; + + __private float sum = 0.0f; + + // Loop over sub-blocks of 32 elements, N_SIMDGROUP sub-blocks per iter + for (uint ib = sgid; ib < num_subblocks; ib += N_SIMDGROUP) { + uint sb = ib / 8; + uint j = ib % 8; + + // Load d and dmin for this super-block + half d_val = src0_d[expert_d_offset + sb * ne01 + i01]; + half dm_val 
= src0_dm[expert_d_offset + sb * ne01 + i01]; + + // Load sub-block scale and min + global const uchar * sc = src0_s + (expert_id * ne01 + i01) * scales_per_row + sb * K_SCALE_SIZE; + uchar sv, mn; + get_scale_min_k4(j, sc, &sv, &mn); + + float scale = (float)d_val * (float)sv; + float minv = (float)dm_val * (float)mn; + + // Load 4 uints of quants (32 nibbles = 32 elements) + uint q_base = expert_q_offset + ib * ne01 * 4 + i01; + + uint4 regQ; + regQ.s0 = src0_q[q_base]; + regQ.s1 = src0_q[q_base + ne01]; + regQ.s2 = src0_q[q_base + ne01 * 2]; + regQ.s3 = src0_q[q_base + ne01 * 3]; + + // Load activations: 32 floats = 8 float4s + uint y_offset = i11 * ne00 / 4 + ib * 8; + + float8 fp32x8 = q4_k_to_fp32_packed8(as_ushort2(regQ.s0), scale, minv); + + float4 shared_y4; + shared_y4 = read_imagef(src1, (y_offset + 0)); + float4 acc = shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 1)); + acc += shared_y4 * fp32x8.hi; + + fp32x8 = q4_k_to_fp32_packed8(as_ushort2(regQ.s1), scale, minv); + + shared_y4 = read_imagef(src1, (y_offset + 2)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 3)); + acc += shared_y4 * fp32x8.hi; + + fp32x8 = q4_k_to_fp32_packed8(as_ushort2(regQ.s2), scale, minv); + + shared_y4 = read_imagef(src1, (y_offset + 4)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 5)); + acc += shared_y4 * fp32x8.hi; + + fp32x8 = q4_k_to_fp32_packed8(as_ushort2(regQ.s3), scale, minv); + + shared_y4 = read_imagef(src1, (y_offset + 6)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 7)); + acc += shared_y4 * fp32x8.hi; + + sum += ((acc.s0 + acc.s1) + (acc.s2 + acc.s3)); + } + + // reduction in local memory, assumes #subgroups=4 + __local float reduceLM[SIMDGROUP_WIDTH * (N_SIMDGROUP - 1)]; + if (sgid == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = sum; + if (sgid == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = sum; + if (sgid == 3) reduceLM[SIMDGROUP_WIDTH * 2 + 
slid] = sum; + barrier(CLK_LOCAL_MEM_FENCE); + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 0 + slid]; + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 1 + slid]; + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 2 + slid]; + + // 1 output per thread in subgroup 0 + if (sgid == 0) { + dst = dst + (offsetd >> 2); + dst[i01 + i20 * ne01] = sum; + } +} diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl new file mode 100644 index 000000000..f128d4434 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl @@ -0,0 +1,156 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable + +#define QK_K 256 +#define K_SCALE_SIZE 12 +#define N_SIMDGROUP 4 +#define SIMDGROUP_WIDTH 64 + +inline void get_scale_min_k4( + int j, + global const uchar * q, + uchar * d, + uchar * m +) { + if (j < 4) { + *d = q[j] & 63; + *m = q[j+4] & 63; + } else { + *d = (q[j+4] & 0x0F) | ((q[j-4] & 0xC0) >> 2); + *m = ((q[j+4] >> 4) & 0x0F) | ((q[j] & 0xC0) >> 2); + } +} + +static inline float8 q5_k_to_fp32_packed8(ushort2 qs5x8, uchar qh5x8, half s, half m) { + float8 fp32x8; + fp32x8.s0 = (float)((( qs5x8.s0 & 0x000F) | (( qh5x8 & 0x01) << 4)) * s + m); + fp32x8.s1 = (float)((((qs5x8.s0 & 0x00F0) >> 4 ) | (((qh5x8 >> 1) & 0x01) << 4)) * s + m); + fp32x8.s2 = (float)((((qs5x8.s0 & 0x0F00) >> 8 ) | (((qh5x8 >> 2) & 0x01) << 4)) * s + m); + fp32x8.s3 = (float)((((qs5x8.s0 & 0xF000) >> 12) | (((qh5x8 >> 3) & 0x01) << 4)) * s + m); + fp32x8.s4 = (float)((( qs5x8.s1 & 0x000F) | (((qh5x8 >> 4) & 0x01) << 4)) * s + m); + fp32x8.s5 = (float)((((qs5x8.s1 & 0x00F0) >> 4 ) | (((qh5x8 >> 5) & 0x01) << 4)) * s + m); + fp32x8.s6 = (float)((((qs5x8.s1 & 0x0F00) >> 8 ) | (((qh5x8 >> 6) & 0x01) << 4)) * s + m); + fp32x8.s7 = (float)((((qs5x8.s1 & 0xF000) >> 12) | (((qh5x8 >> 7) & 0x01) << 4)) * s + m); + return fp32x8; 
+} + +__attribute__((qcom_reqd_sub_group_size("half"))) +__kernel void kernel_gemv_moe_q5_k_f32_ns( + __global uint * src0_q, + __global uint * src0_qh, + __global half * src0_d, + __global half * src0_dm, + __global uchar * src0_s, + __read_only image1d_buffer_t src1, + __global uint * src2, + __global float * dst, + ulong offsetd, + int ne00, + int ne01, + int ne11 +) { + uint i01 = get_global_id(0); + uint i20 = get_global_id(2); + uint sgid = get_local_id(1); + uint slid = get_sub_group_local_id(); + + uint i11 = i20 % ne11; + + uint expert_id = src2[i20]; + + int num_superblocks = ne00 / QK_K; + int num_subblocks = ne00 / 32; + int scales_per_row = num_superblocks * K_SCALE_SIZE; + + // Expert offsets in the transposed noshuffle layout + uint expert_q_offset = expert_id * (ne00 / 8) * ne01; + uint expert_d_offset = expert_id * num_superblocks * ne01; + + __private float sum = 0.0f; + + // Loop over sub-blocks of 32 elements, N_SIMDGROUP sub-blocks per iter + for (uint ib = sgid; ib < num_subblocks; ib += N_SIMDGROUP) { + uint sb = ib / 8; + uint j = ib % 8; + + // Load d and dmin for this super-block + half d_val = src0_d[expert_d_offset + sb * ne01 + i01]; + half dm_val = src0_dm[expert_d_offset + sb * ne01 + i01]; + + // sub_block index = sb * 8 + j + uint expert_qh_offset = expert_id * num_superblocks * 8 * ne01; + uchar4 regQh = as_uchar4(src0_qh[expert_qh_offset + (sb * 8 + j) * ne01 + i01]); + + // Load sub-block scale and min + global const uchar * sc = src0_s + (expert_id * ne01 + i01) * scales_per_row + sb * K_SCALE_SIZE; + uchar sv, mn; + get_scale_min_k4(j, sc, &sv, &mn); + + float scale = (float)d_val * (float)sv; + float minv = -(float)dm_val * (float)mn; + + // Load 4 uints of quants (32 nibbles = 32 elements) + uint q_base = expert_q_offset + ib * ne01 * 4 + i01; + + uint4 regQ; + regQ.s0 = src0_q[q_base]; + regQ.s1 = src0_q[q_base + ne01]; + regQ.s2 = src0_q[q_base + ne01 * 2]; + regQ.s3 = src0_q[q_base + ne01 * 3]; + + // Load activations: 32 
floats = 8 float4s + uint y_offset = i11 * ne00 / 4 + ib * 8; + + float8 fp32x8 = q5_k_to_fp32_packed8(as_ushort2(regQ.s0), regQh.s0, scale, minv); + + float4 shared_y4; + shared_y4 = read_imagef(src1, (y_offset + 0)); + float4 acc = shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 1)); + acc += shared_y4 * fp32x8.hi; + + fp32x8 = q5_k_to_fp32_packed8(as_ushort2(regQ.s1), regQh.s1, scale, minv); + + shared_y4 = read_imagef(src1, (y_offset + 2)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 3)); + acc += shared_y4 * fp32x8.hi; + + fp32x8 = q5_k_to_fp32_packed8(as_ushort2(regQ.s2), regQh.s2, scale, minv); + + shared_y4 = read_imagef(src1, (y_offset + 4)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 5)); + acc += shared_y4 * fp32x8.hi; + + fp32x8 = q5_k_to_fp32_packed8(as_ushort2(regQ.s3), regQh.s3, scale, minv); + + shared_y4 = read_imagef(src1, (y_offset + 6)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 7)); + acc += shared_y4 * fp32x8.hi; + + sum += ((acc.s0 + acc.s1) + (acc.s2 + acc.s3)); + } + + // reduction in local memory, assumes #subgroups=4 + __local float reduceLM[SIMDGROUP_WIDTH * (N_SIMDGROUP - 1)]; + if (sgid == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = sum; + if (sgid == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = sum; + if (sgid == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = sum; + barrier(CLK_LOCAL_MEM_FENCE); + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 0 + slid]; + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 1 + slid]; + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 2 + slid]; + + // 1 output per thread in subgroup 0 + if (sgid == 0) { + dst = dst + (offsetd >> 2); + dst[i01 + i20 * ne01] = sum; + } +} diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl new file mode 100644 index 000000000..526e609dc --- /dev/null +++ 
b/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl @@ -0,0 +1,137 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable + +#define QK_K 256 +#define N_SIMDGROUP 4 +#define SIMDGROUP_WIDTH 64 + +static inline float8 q6_k_to_fp32_packed8(ushort2 ql8, ushort qh8, float d_scale) { + float8 fp32x8; + fp32x8.s0 = ((float)(( ql8.s0 & 0x000F) | ((uint)((qh8 ) & 0x3) << 4)) - 32.f) * d_scale; + fp32x8.s1 = ((float)((( ql8.s0 >> 4) & 0x000F) | ((uint)((qh8 >> 2) & 0x3) << 4)) - 32.f) * d_scale; + fp32x8.s2 = ((float)((( ql8.s0 >> 8) & 0x000F) | ((uint)((qh8 >> 4) & 0x3) << 4)) - 32.f) * d_scale; + fp32x8.s3 = ((float)((( ql8.s0 >> 12)& 0x000F) | ((uint)((qh8 >> 6) & 0x3) << 4)) - 32.f) * d_scale; + fp32x8.s4 = ((float)(( ql8.s1 & 0x000F) | ((uint)((qh8 >> 8) & 0x3) << 4)) - 32.f) * d_scale; + fp32x8.s5 = ((float)((( ql8.s1 >> 4) & 0x000F) | ((uint)((qh8 >>10) & 0x3) << 4)) - 32.f) * d_scale; + fp32x8.s6 = ((float)((( ql8.s1 >> 8) & 0x000F) | ((uint)((qh8 >>12) & 0x3) << 4)) - 32.f) * d_scale; + fp32x8.s7 = ((float)((( ql8.s1 >> 12)& 0x000F) | ((uint)((qh8 >>14) & 0x3) << 4)) - 32.f) * d_scale; + return fp32x8; +} + +__attribute__((qcom_reqd_sub_group_size("half"))) +__kernel void kernel_gemv_moe_q6_k_f32_ns( + __global uint * src0_ql, + __global uint * src0_qh, + __global char * src0_s, + __global half * src0_d, + __read_only image1d_buffer_t src1, + __global uint * src2, + __global float * dst, + ulong offsetd, + int ne00, + int ne01, + int ne11 +) { + uint i01 = get_global_id(0); + uint i20 = get_global_id(2); + uint sgid = get_local_id(1); + uint slid = get_sub_group_local_id(); + + uint i11 = i20 % ne11; + + uint expert_id = src2[i20]; + + int num_superblocks = ne00 / QK_K; + int num_subblocks = ne00 / 32; // 8 sub-blocks of 32 per super-block + int scales_per_row = num_superblocks * 16; + + // Expert offsets in the transposed noshuffle layout + uint 
expert_ql_offset = expert_id * (ne00 / 8) * ne01; // 32 uints per super-block + uint expert_qh_offset = expert_id * (ne00 / 16) * ne01; // 16 uints per super-block + uint expert_d_offset = expert_id * num_superblocks * ne01; + + __private float sum = 0.0f; + + // Loop over sub-blocks of 32 elements, N_SIMDGROUP sub-blocks per iter + for (uint ib = sgid; ib < num_subblocks; ib += N_SIMDGROUP) { + uint sb = ib / 8; // super-block index + uint j = ib % 8; // 32-element group within super-block + + // Load d for this super-block + half d_val = src0_d[expert_d_offset + sb * ne01 + i01]; + + // Load 2 sub-block scales + global const char * sc = src0_s + (expert_id * ne01 + i01) * scales_per_row + sb * 16; + float scale0 = (float)d_val * (float)sc[j * 2]; + float scale1 = (float)d_val * (float)sc[j * 2 + 1]; + + // Load 4 uints of ql + uint ql_base = expert_ql_offset + (ib * 4) * ne01 + i01; + uint4 regQL; + regQL.s0 = src0_ql[ql_base]; + regQL.s1 = src0_ql[ql_base + ne01]; + regQL.s2 = src0_ql[ql_base + ne01 * 2]; + regQL.s3 = src0_ql[ql_base + ne01 * 3]; + + // Load 2 uints of qh + uint qh_base = expert_qh_offset + (ib * 2) * ne01 + i01; + uint2 regQH; + regQH.s0 = src0_qh[qh_base]; + regQH.s1 = src0_qh[qh_base + ne01]; + + // Load activations: 32 floats = 8 float4s + uint y_offset = i11 * ne00 / 4 + ib * 8; + + float8 fp32x8 = q6_k_to_fp32_packed8(as_ushort2(regQL.s0), (ushort)(regQH.s0 & 0xFFFF), scale0); + + float4 shared_y4; + shared_y4 = read_imagef(src1, (y_offset + 0)); + float4 acc = shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 1)); + acc += shared_y4 * fp32x8.hi; + + fp32x8 = q6_k_to_fp32_packed8(as_ushort2(regQL.s1), (ushort)(regQH.s0 >> 16), scale0); + + shared_y4 = read_imagef(src1, (y_offset + 2)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 3)); + acc += shared_y4 * fp32x8.hi; + + fp32x8 = q6_k_to_fp32_packed8(as_ushort2(regQL.s2), (ushort)(regQH.s1 & 0xFFFF), scale1); + + shared_y4 = 
read_imagef(src1, (y_offset + 4)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 5)); + acc += shared_y4 * fp32x8.hi; + + fp32x8 = q6_k_to_fp32_packed8(as_ushort2(regQL.s3), (ushort)(regQH.s1 >> 16), scale1); + + shared_y4 = read_imagef(src1, (y_offset + 6)); + acc += shared_y4 * fp32x8.lo; + + shared_y4 = read_imagef(src1, (y_offset + 7)); + acc += shared_y4 * fp32x8.hi; + + sum += ((acc.s0 + acc.s1) + (acc.s2 + acc.s3)); + } + + // reduction in local memory, assumes #subgroups=4 + __local float reduceLM[SIMDGROUP_WIDTH * (N_SIMDGROUP - 1)]; + if (sgid == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = sum; + if (sgid == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = sum; + if (sgid == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = sum; + barrier(CLK_LOCAL_MEM_FENCE); + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 0 + slid]; + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 1 + slid]; + if (sgid == 0) sum += reduceLM[SIMDGROUP_WIDTH * 2 + slid]; + + // 1 output per thread in subgroup 0 + if (sgid == 0) { + dst = dst + (offsetd >> 2); + dst[i01 + i20 * ne01] = sum; + } +} From b39a7bf1b038e8d1c3d58a313228dfc4f3993f5c Mon Sep 17 00:00:00 2001 From: ravel7524 <58877666+ravel7524@users.noreply.github.com> Date: Wed, 20 May 2026 03:52:21 +0200 Subject: [PATCH 30/33] ggml-cuda: tune RDNA3 Q6_K MMVQ nwarps (#23349) --- ggml/src/ggml-cuda/mmvq.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index da48f313a..73a0991e2 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -359,7 +359,9 @@ static constexpr __host__ __device__ int calc_nwarps(ggml_type type, int ncols_d case GGML_TYPE_Q5_1: case GGML_TYPE_Q8_0: case GGML_TYPE_Q4_K: + return 8; case GGML_TYPE_Q6_K: + return 2; case GGML_TYPE_IQ4_NL: return 8; default: From 871b0b70f81d26494613ad7a9dcb933b1aec4611 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 19 May 2026 22:04:04 -0700 Subject: [PATCH 31/33] 
snapdragon: update toolchain to v0.6 (#23369) * snapdragon: update compiler flags to enable all CPU features * snapdragon: update readme to point to toolchain v0.6 * snapdragon: bump toolchain docker to v0.6 --- .github/workflows/build-and-test-snapdragon.yml | 4 ++-- docs/backend/snapdragon/CMakeUserPresets.json | 8 ++++---- docs/backend/snapdragon/README.md | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build-and-test-snapdragon.yml b/.github/workflows/build-and-test-snapdragon.yml index ef3fe502f..84613b4c8 100644 --- a/.github/workflows/build-and-test-snapdragon.yml +++ b/.github/workflows/build-and-test-snapdragon.yml @@ -31,7 +31,7 @@ jobs: android-ndk-snapdragon: runs-on: ubuntu-latest container: - image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3' + image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.6' defaults: run: shell: bash @@ -61,7 +61,7 @@ jobs: linux-iot-snapdragon: runs-on: ubuntu-latest container: - image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.1' + image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.6' defaults: run: shell: bash diff --git a/docs/backend/snapdragon/CMakeUserPresets.json b/docs/backend/snapdragon/CMakeUserPresets.json index c07bf5ca0..d2629fc4d 100644 --- a/docs/backend/snapdragon/CMakeUserPresets.json +++ b/docs/backend/snapdragon/CMakeUserPresets.json @@ -10,8 +10,8 @@ "ANDROID_ABI": "arm64-v8a", "ANDROID_PLATFORM": "android-31", "CMAKE_TOOLCHAIN_FILE": "$env{ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake", - "CMAKE_C_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE", - "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE", + "CMAKE_C_FLAGS": "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE", + "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -fno-finite-math-only -flto 
-D_GNU_SOURCE", "CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG", "CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG", "CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g", @@ -59,8 +59,8 @@ "toolset": { "value": "host=x86_64", "strategy": "external" }, "cacheVariables": { "CMAKE_TOOLCHAIN_FILE": "cmake/arm64-linux-clang.cmake", - "CMAKE_C_FLAGS": "-march=armv8 -fno-finite-math-only -flto -D_GNU_SOURCE", - "CMAKE_CXX_FLAGS": "-march=armv8 -fno-finite-math-only -flto -D_GNU_SOURCE", + "CMAKE_C_FLAGS": "-march=armv8.2a+fp16+dotprod -fvectorize -fno-finite-math-only -flto -D_GNU_SOURCE", + "CMAKE_CXX_FLAGS": "-march=armv8.2a+fp16+dotprod -fvectorize -fno-finite-math-only -flto -D_GNU_SOURCE", "CMAKE_C_FLAGS_RELEASE": "-O3 -DNDEBUG", "CMAKE_CXX_FLAGS_RELEASE": "-O3 -DNDEBUG", "CMAKE_C_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g", diff --git a/docs/backend/snapdragon/README.md b/docs/backend/snapdragon/README.md index 2414eeaf6..f5bb3d11c 100644 --- a/docs/backend/snapdragon/README.md +++ b/docs/backend/snapdragon/README.md @@ -10,7 +10,7 @@ This image includes Android NDK, OpenCL SDK, Hexagon SDK, CMake, etc. This method works on Linux, macOS, and Windows. macOS and Windows users should install Docker Desktop. 
``` -~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.3 +~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.6 [d]/> cd /workspace ``` From 57ebaf4edd99ea675f256ae2286cd99206dbfcd1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 20 May 2026 09:42:00 +0300 Subject: [PATCH 32/33] metal : optimize pad + cpy (#23354) * metal : optimize pad * metal : optimize cpy * cont : better row packing in threadgroup --- ggml/src/ggml-metal/ggml-metal-device.cpp | 8 +- ggml/src/ggml-metal/ggml-metal-ops.cpp | 17 ++- ggml/src/ggml-metal/ggml-metal.metal | 128 ++++++++++++---------- src/models/delta-net-base.cpp | 8 +- 4 files changed, 94 insertions(+), 67 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp index e288a27f9..ba006d9b3 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.cpp +++ b/ggml/src/ggml-metal/ggml-metal-device.cpp @@ -1897,7 +1897,11 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad(ggml_metal_l char base[256]; char name[256]; - snprintf(base, 256, "kernel_pad_%s", ggml_type_name(op->src[0]->type)); + // note: this is slower + //const bool is_c4 = op->src[0]->ne[0] % 4 == 0 && op->ne[0] % 4 == 0; + const bool is_c4 = false; + + snprintf(base, 256, "kernel_pad_%s%s", ggml_type_name(op->src[0]->type), is_c4 ? 
"_4" : ""); snprintf(name, 256, "%s", base); ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); @@ -1907,6 +1911,8 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad(ggml_metal_l res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr); + res.c4 = is_c4; + return res; } diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index a114391c2..8506000b6 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -816,9 +816,7 @@ int ggml_metal_op_unary(ggml_metal_op_t ctx, int idx) { ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, 1, 1, 1); } else { const int nth_max = MIN(256, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)); - const int nth = MIN(args.ne00, nth_max); - const int nk0 = (args.ne00 + nth - 1)/nth; ggml_metal_encoder_dispatch_threadgroups(enc, nk0*ne01, ne02, ne03, nth, 1, 1); @@ -1863,7 +1861,7 @@ int ggml_metal_op_cpy(ggml_metal_op_t ctx, int idx) { nk0 = ne00/ggml_blck_size(op->type); } - int nth = std::min(nk0, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)); + int nth = std::min(nk0*ne01, 256); // when rows are small, we can batch them together in a single threadgroup int nrptg = 1; @@ -1874,7 +1872,7 @@ int ggml_metal_op_cpy(ggml_metal_op_t ctx, int idx) { nrptg = (nth + nk0 - 1)/nk0; nth = nk0; - if (nrptg*nth > ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) { + if (nrptg*nth > 256) { nrptg--; } } @@ -4039,14 +4037,21 @@ int ggml_metal_op_pad(ggml_metal_op_t ctx, int idx) { auto pipeline = ggml_metal_library_get_pipeline_pad(lib, op); - const int nth = std::min(1024, ne0); + if (pipeline.c4) { + args.ne00 = ne00/4; + args.ne0 = ne0/4; + } + + const int nth_max = MIN(64, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)); + const int nth = MIN(args.ne0, nth_max); + const int nk0 = (args.ne0 + 1024 - 1)/1024; // note: 1024 is hardcoded in the kernel! 
ggml_metal_encoder_set_pipeline(enc, pipeline); ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0); ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1); ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2); - ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1); + ggml_metal_encoder_dispatch_threadgroups(enc, nk0*ne1, ne2, ne3, nth, 1, 1); return 1; } diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index f6ffb2b3a..4cf9dbea9 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -2643,7 +2643,7 @@ kernel void kernel_gated_delta_net_impl( b_ptr += args.ne21; g_ptr += args.ne21*G; - if (K > 1u) { + if (K > 1) { const int target_slot = (int)t - shift; if (target_slot >= 0 && target_slot < (int)K) { device float * dst_state = (device float *) (dst) + attn_size + (uint)target_slot * state_size_per_snap + state_out_base; @@ -2655,7 +2655,7 @@ kernel void kernel_gated_delta_net_impl( } } - if (K == 1u) { + if (K == 1) { device float * dst_state = (device float *) (dst) + attn_size + state_out_base; FOR_UNROLL (short j = 0; j < NSG; j++) { const short is = tx*NSG + j; @@ -5104,7 +5104,7 @@ kernel void kernel_upscale_bilinear_f32( for (int64_t sx = x_min; sx < x_max; ++sx) { const float wx = MAX(0.0f, 1.0f - fabs((float)sx - f00) * invscale0); const float w = wx * wy; - const device const float * src_ptr = (device const float *)(src0 + sy*args.nb01 + sx*args.nb00); + device const float * src_ptr = (device const float *)(src0 + sy*args.nb01 + sx*args.nb00); sum += (*src_ptr) * w; wsum += w; } @@ -5286,7 +5286,7 @@ kernel void kernel_upscale_bicubic_f32( const int64_t ix = MAX(0, MIN(args.ne00 - 1, i00 + dx)); const float wx = (dx == -1) ? w_x0 : (dx == 0) ? w_x1 : (dx == 1) ? 
w_x2 : w_x3; - const device const float * src_ptr = (device const float *)(src_slice + iy * args.nb01 + ix * args.nb00); + device const float * src_ptr = (device const float *)(src_slice + iy * args.nb01 + ix * args.nb00); sum += (*src_ptr) * wx * wy; } } @@ -5329,42 +5329,46 @@ kernel void kernel_roll_f32( } } -kernel void kernel_pad_f32( +template +kernel void kernel_pad_impl( constant ggml_metal_kargs_pad & args, device const char * src0, device char * dst, uint3 tgpig[[threadgroup_position_in_grid]], uint3 tpitg[[thread_position_in_threadgroup]], uint3 ntg[[threads_per_threadgroup]]) { + const int32_t i3 = tgpig.z; + const int32_t i2 = tgpig.y; + const int32_t k0 = tgpig.x/args.ne1; + const int32_t i1 = tgpig.x - k0*args.ne1; - const int64_t i3 = tgpig.z; - const int64_t i2 = tgpig.y; - const int64_t i1 = tgpig.x; + const int32_t i03 = i3; + const int32_t i02 = i2; + const int32_t i01 = i1; - const int64_t i03 = i3; - const int64_t i02 = i2; - const int64_t i01 = i1; + device const T * src0_ptr = (device const T *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01); + device T * dst_ptr = (device T *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1); - device const float * src0_ptr = (device const float *) (src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01); - device float * dst_ptr = (device float *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1); - - if (i1 < args.ne01 && i2 < args.ne02 && i3 < args.ne03) { - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - if (i0 < args.ne00) { - dst_ptr[i0] = src0_ptr[i0]; - } else { - dst_ptr[i0] = 0.0f; - } + for (int32_t l0 = 0; l0 < 1024; l0 += ntg.x) { + const int32_t i0 = k0*1024 + tpitg.x + l0; + if (i0 >= args.ne0) { + break; } - return; - } - - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { - dst_ptr[i0] = 0.0f; + if (i0 < args.ne00 && i1 < args.ne01 && i2 < args.ne02 && i3 < args.ne03) { + dst_ptr[i0] = src0_ptr[i0]; + } else { + dst_ptr[i0] = 0.0f; + } } } +typedef decltype(kernel_pad_impl) 
kernel_pad_t; + +template [[host_name("kernel_pad_f32")]] kernel kernel_pad_t kernel_pad_impl; +template [[host_name("kernel_pad_f32_4")]] kernel kernel_pad_t kernel_pad_impl; + +// TODO: this is slow - optimize kernel void kernel_pad_reflect_1d_f32( constant ggml_metal_kargs_pad_reflect_1d & args, device const char * src0, @@ -7328,23 +7332,27 @@ kernel void kernel_cpy_t_t( device const char * src0, device char * dst, uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiitg[[thread_index_in_threadgroup]], + ushort3 tpitg[[thread_position_in_threadgroup]], ushort3 ntg[[threads_per_threadgroup]]) { - const int i03 = tgpig[2]; - const int i02 = tgpig[1]; - const int i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tiitg/ntg[0]; - const int iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0; + const int32_t i03 = tgpig[2]; + const int32_t i02 = tgpig[1]; + const int32_t i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tpitg.y; + const int32_t iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0; + + if (i01 >= args.ne01) { + return; + } const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00; - const int64_t i3 = n/(args.ne2*args.ne1*args.ne0); - const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0); - const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0; - const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0); + const int32_t i3 = n/(args.ne2*args.ne1*args.ne0); + const int32_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0); + const int32_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0; + const int32_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0); device T1 * dst_data = (device T1 *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); - for (int64_t i00 = iw0*ntg[0] + tiitg%ntg[0]; i00 < args.ne00; ) { + for (int32_t i00 = iw0*ntg[0] + tpitg.x; i00 < 
args.ne00;) { device const T0 * src = (device T0 *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00); dst_data[i00] = (T1) src[0]; break; @@ -7376,23 +7384,27 @@ kernel void kernel_cpy_f32_q( device const char * src0, device char * dst, uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiitg[[thread_index_in_threadgroup]], + ushort3 tpitg[[thread_position_in_threadgroup]], ushort3 ntg[[threads_per_threadgroup]]) { - const int i03 = tgpig[2]; - const int i02 = tgpig[1]; - const int i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tiitg/ntg[0]; - const int iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0; + const int32_t i03 = tgpig[2]; + const int32_t i02 = tgpig[1]; + const int32_t i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tpitg.y; + const int32_t iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0; + + if (i01 >= args.ne01) { + return; + } const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00; - const int64_t i3 = n / (args.ne2*args.ne1*args.ne0); - const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0) / (args.ne1*args.ne0); - const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0) / args.ne0; - const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0)/QK; + const int32_t i3 = n / (args.ne2*args.ne1*args.ne0); + const int32_t i2 = (n - i3*args.ne2*args.ne1*args.ne0) / (args.ne1*args.ne0); + const int32_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0) / args.ne0; + const int32_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0)/QK; device block_q * dst_data = (device block_q *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); - for (int64_t i00 = iw0*ntg[0] + tiitg%ntg[0]; i00 < args.nk0; ) { + for (int32_t i00 = iw0*ntg[0] + tpitg.x; i00 < args.nk0;) { device const float * src = (device const float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + (i00*QK)*args.nb00); 
quantize_func(src, dst_data[i00]); @@ -7417,24 +7429,28 @@ kernel void kernel_cpy_q_f32( device const char * src0, device char * dst, uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiitg[[thread_index_in_threadgroup]], + ushort3 tpitg[[thread_position_in_threadgroup]], ushort3 ntg[[threads_per_threadgroup]]) { - const int i03 = tgpig[2]; - const int i02 = tgpig[1]; - const int i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tiitg/ntg[0]; - const int iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0; + const int32_t i03 = tgpig[2]; + const int32_t i02 = tgpig[1]; + const int32_t i01 = ntg[1] == 1 ? tgpig[0]%args.ne01 : tgpig[0]*ntg[1] + tpitg.y; + const int32_t iw0 = ntg[1] == 1 ? tgpig[0]/args.ne01 : 0; + + if (i01 >= args.ne01) { + return; + } const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00; - const int64_t i3 = n/(args.ne2*args.ne1*args.ne0); - const int64_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0); - const int64_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0; - const int64_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0); + const int32_t i3 = n/(args.ne2*args.ne1*args.ne0); + const int32_t i2 = (n - i3*args.ne2*args.ne1*args.ne0)/(args.ne1*args.ne0); + const int32_t i1 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0)/args.ne0; + const int32_t i0 = (n - i3*args.ne2*args.ne1*args.ne0 - i2*args.ne1*args.ne0 - i1*args.ne0); device const block_q * src_data = (device const block_q *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01); device T4x4 * dst_data = (device T4x4 *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); - for (int64_t i00 = iw0*ntg[0] + tiitg%ntg[0]; i00 < args.nk0; ) { + for (int32_t i00 = iw0*ntg[0] + tpitg.x; i00 < args.nk0;) { T4x4 temp; dequantize_func(src_data + i00/nl, i00%nl, temp); dst_data[i00] = temp; diff --git a/src/models/delta-net-base.cpp b/src/models/delta-net-base.cpp 
index a67238383..4f4c7cac7 100644 --- a/src/models/delta-net-base.cpp +++ b/src/models/delta-net-base.cpp @@ -562,13 +562,13 @@ ggml_tensor * llm_build_delta_net_base::build_recurrent_attn( } const int64_t D = S_v * S_v * H_v; - const int64_t K = (int64_t) cparams.n_rs_seq + 1; + const int64_t K = cparams.n_rs_seq + 1; // TODO: remove pad + simplify - ggml_tensor * state_in_3d = ggml_reshape_3d(ctx0, s, D, 1, n_seqs); - ggml_tensor * state_3d = ggml_pad(ctx0, state_in_3d, 0, K - 1, 0, 0); + ggml_tensor * s_3d = ggml_reshape_3d(ctx0, s, D, 1, n_seqs); + ggml_tensor * s_3d_pad = ggml_pad (ctx0, s_3d, 0, K - 1, 0, 0); - ggml_tensor * gdn_out = ggml_gated_delta_net(ctx0, q, k, v, g, b, state_3d); + ggml_tensor * gdn_out = ggml_gated_delta_net(ctx0, q, k, v, g, b, s_3d_pad); if (n_seq_tokens > 1) { cb(gdn_out, LLAMA_TENSOR_NAME_FGDN_CH, il); } else { From 585080d3103fe1091987a4576780991ff88f802e Mon Sep 17 00:00:00 2001 From: Aleksander Grygier Date: Wed, 20 May 2026 09:46:31 +0200 Subject: [PATCH 33/33] fix: Div wrapper no pointer events on hidden (#23390) --- .../app/chat/ChatScreen/ChatScreenActionScrollDown.svelte | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenActionScrollDown.svelte b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenActionScrollDown.svelte index bbfed95e9..c43bee3e3 100644 --- a/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenActionScrollDown.svelte +++ b/tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenActionScrollDown.svelte @@ -41,12 +41,16 @@ }); -
+