Merge branch 'upstream' into concedo_experimental

# Conflicts: # .github/workflows/build-linux-cross.yml # .github/workflows/build.yml # CODEOWNERS # ggml/CMakeLists.txt # ggml/src/ggml-cuda/fattn.cu # ggml/src/ggml-webgpu/CMakeLists.txt # ggml/src/ggml-webgpu/ggml-webgpu.cpp # ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl # tests/test-backend-ops.cpp # tests/test-chat-template.cpp # tools/llama-bench/llama-bench.cpp # tools/rpc/README.md # tools/server/README.md
2026-05-21 18:52:02 +00:00 · 2025-10-09 01:33:27 +08:00 · 2025-10-09 01:33:27 +08:00 · b6f6338bba
commit b6f6338bba
parent 224800b33b d2ee056e1d
32 changed files with 1556 additions and 636 deletions
--- a/tools/server/public/index.html.gz
+++ b/tools/server/public/index.html.gz
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@ -1937,7 +1937,7 @@ private:
    void cleanup_pending_task(int id_target) {
        // no need lock because this is called exclusively by post()
        auto rm_func = [id_target](const server_task & task) {
-            return task.id_target == id_target;
+            return task.id == id_target;
        };
        queue_tasks.erase(
            std::remove_if(queue_tasks.begin(),          queue_tasks.end(),          rm_func),
@ -3676,6 +3676,20 @@ struct server_context {
                        alora_disabled_id = enabled_loras[0];
                    }

+                    bool do_checkpoint = params_base.n_ctx_checkpoints > 0;
+
+                    // make a checkpoint of the parts of the memory that cannot be rolled back.
+                    // checkpoints are created only if:
+                    // - the model uses SWA and we are not using `swa_full`
+                    // - the model architecture is marked as recurrent or hybrid
+                    //
+                    // TODO: try to make this conditional on the context or the memory module, instead of the model type
+                    do_checkpoint = do_checkpoint && (
+                            llama_model_is_recurrent(model) ||
+                            llama_model_is_hybrid(model) ||
+                            (llama_model_n_swa(model) > 0 && !params_base.swa_full)
+                            );
+
                    // add prompt tokens for processing in the current batch
                    while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
                        // get next token to process
@ -3700,6 +3714,11 @@ struct server_context {

                        slot.n_prompt_tokens_processed++;
                        slot.n_past++;
+
+                        // process the last few tokens of the prompt separately in order to allow for a checkpoint to be created.
+                        if (do_checkpoint && slot.n_prompt_tokens - slot.n_past == 64) {
+                            break;
+                        }
                    }

                    // SLT_INF(slot, "new cache_tokens: %s\n", slot.cache_tokens.str().c_str());
@ -3730,6 +3749,39 @@ struct server_context {
                        slot.i_batch   = batch.n_tokens - 1;

                        SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch.n_tokens);
+
+                        const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id);
+                        const auto pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx), slot.id);
+
+                        // no need for empty or small checkpoints
+                        do_checkpoint = do_checkpoint && (pos_min >= 0 && pos_max >= 64);
+
+                        // no need to create checkpoints that are too close together
+                        do_checkpoint = do_checkpoint && (slot.ctx_checkpoints.empty() || pos_max > slot.ctx_checkpoints.back().pos_max + 64);
+
+                        if (do_checkpoint) {
+                            while (slot.ctx_checkpoints.size() >= (size_t) params_base.n_ctx_checkpoints) {
+                                // make room for the new checkpoint, if needed
+                                const auto & cur = slot.ctx_checkpoints.front();
+                                SLT_WRN(slot, "erasing old context checkpoint (pos_min = %d, pos_max = %d, size = %.3f MiB)\n",
+                                        cur.pos_min, cur.pos_max, (float) cur.data.size() / 1024 / 1024);
+
+                                slot.ctx_checkpoints.erase(slot.ctx_checkpoints.begin());
+                            }
+
+                            const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
+
+                            auto & cur = slot.ctx_checkpoints.emplace_back(ctx_checkpoint{
+                                /*.pos_min = */ pos_min,
+                                /*.pos_max = */ pos_max,
+                                /*.data    = */ std::vector<uint8_t>(checkpoint_size),
+                            });
+
+                            llama_state_seq_get_data_ext(ctx, cur.data.data(), checkpoint_size, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
+
+                            SLT_WRN(slot, "saved context checkpoint %d of %d (pos_min = %d, pos_max = %d, size = %.3f MiB)\n",
+                                    (int) slot.ctx_checkpoints.size(), params_base.n_ctx_checkpoints, cur.pos_min, cur.pos_max, (float) cur.data.size() / 1024 / 1024);
+                        }
                    }
                }

@ -3853,40 +3905,6 @@ struct server_context {

                    // prompt evaluated for next-token prediction
                    slot.state = SLOT_STATE_GENERATING;
-
-                    // make a checkpoint of the parts of the memory that cannot be rolled back.
-                    // checkpoints are created only if:
-                    // - the model uses SWA and we are not using `swa_full`
-                    // - the model architecture is marked as recurrent or hybrid
-                    //
-                    // TODO: try to make this conditional on the context or the memory module, instead of the model type
-                    const bool do_checkpoint =
-                        (llama_model_is_recurrent(model) || llama_model_is_hybrid(model)) ||
-                        (llama_model_n_swa(model) > 0 && !params_base.swa_full);
-
-                    if (do_checkpoint && params_base.n_ctx_checkpoints > 0) {
-                        while (slot.ctx_checkpoints.size() >= (size_t) params_base.n_ctx_checkpoints) {
-                            // make room for the new checkpoint, if needed
-                            const auto & cur = slot.ctx_checkpoints.front();
-                            SLT_WRN(slot, "erasing old context checkpoint (pos_min = %d, pos_max = %d, size = %.3f MiB)\n",
-                                    cur.pos_min, cur.pos_max, (float) cur.data.size() / 1024 / 1024);
-
-                            slot.ctx_checkpoints.erase(slot.ctx_checkpoints.begin());
-                        }
-
-                        const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
-
-                        auto & cur = slot.ctx_checkpoints.emplace_back(ctx_checkpoint{
-                            /*.pos_min = */ llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id),
-                            /*.pos_max = */ llama_memory_seq_pos_max(llama_get_memory(ctx), slot.id),
-                            /*.data    = */ std::vector<uint8_t>(checkpoint_size),
-                        });
-
-                        llama_state_seq_get_data_ext(ctx, cur.data.data(), checkpoint_size, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
-
-                        SLT_WRN(slot, "saved context checkpoint %d of %d (pos_min = %d, pos_max = %d, size = %.3f MiB)\n",
-                                (int) slot.ctx_checkpoints.size(), params_base.n_ctx_checkpoints, cur.pos_min, cur.pos_max, (float) cur.data.size() / 1024 / 1024);
-                    }
                } else if (slot.state != SLOT_STATE_GENERATING) {
                    continue; // continue loop of slots
                }
@ -4184,6 +4202,7 @@ int main(int argc, char ** argv) {
    auto middleware_validate_api_key = [&params, &res_error](const httplib::Request & req, httplib::Response & res) {
        static const std::unordered_set<std::string> public_endpoints = {
            "/health",
+            "/v1/health",
            "/models",
            "/v1/models",
            "/api/tags"
@ -5232,6 +5251,7 @@ int main(int argc, char ** argv) {

    // register API routes
    svr->Get (params.api_prefix + "/health",              handle_health); // public endpoint (no API key check)
+    svr->Get (params.api_prefix + "/v1/health",           handle_health); // public endpoint (no API key check)
    svr->Get (params.api_prefix + "/metrics",             handle_metrics);
    svr->Get (params.api_prefix + "/props",               handle_props);
    svr->Post(params.api_prefix + "/props",               handle_props_change);
--- a/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebarActions.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebarActions.svelte
@ -1,8 +1,9 @@
 <script lang="ts">
-	import { Search, SquarePen, X } from '@lucide/svelte';
+	import { Search, SquarePen, X, Download, Upload } from '@lucide/svelte';
 	import { KeyboardShortcutInfo } from '$lib/components/app';
 	import { Button } from '$lib/components/ui/button';
 	import { Input } from '$lib/components/ui/input';
+	import { exportAllConversations, importConversations } from '$lib/stores/chat.svelte';

 	interface Props {
 		handleMobileSidebarItemClick: () => void;
@ -77,5 +78,34 @@

 			<KeyboardShortcutInfo keys={['cmd', 'k']} />
 		</Button>
+
+		<Button
+			class="w-full justify-start text-sm"
+			onclick={() => {
+				importConversations().catch((err) => {
+					console.error('Import failed:', err);
+					// Optional: show toast or dialog
+				});
+			}}
+			variant="ghost"
+		>
+			<div class="flex items-center gap-2">
+				<Upload class="h-4 w-4" />
+				Import conversations
+			</div>
+		</Button>
+
+		<Button
+			class="w-full justify-start text-sm"
+			onclick={() => {
+				exportAllConversations();
+			}}
+			variant="ghost"
+		>
+			<div class="flex items-center gap-2">
+				<Download class="h-4 w-4" />
+				Export all conversations
+			</div>
+		</Button>
 	{/if}
 </div>
--- a/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebarConversationItem.svelte
+++ b/tools/server/webui/src/lib/components/app/chat/ChatSidebar/ChatSidebarConversationItem.svelte
@ -1,6 +1,7 @@
 <script lang="ts">
-	import { Trash2, Pencil, MoreHorizontal } from '@lucide/svelte';
+	import { Trash2, Pencil, MoreHorizontal, Download } from '@lucide/svelte';
 	import { ActionDropdown } from '$lib/components/app';
+	import { downloadConversation } from '$lib/stores/chat.svelte';
 	import { onMount } from 'svelte';

 	interface Props {
@ -101,6 +102,15 @@
 						onclick: handleEdit,
 						shortcut: ['shift', 'cmd', 'e']
 					},
+					{
+						icon: Download,
+						label: 'Export',
+						onclick: (e) => {
+							e.stopPropagation();
+							downloadConversation(conversation.id);
+						},
+						shortcut: ['shift', 'cmd', 's']
+					},
 					{
 						icon: Trash2,
 						label: 'Delete',
--- a/tools/server/webui/src/lib/stores/chat.svelte.ts
+++ b/tools/server/webui/src/lib/stores/chat.svelte.ts
@ -6,6 +6,8 @@ import { filterByLeafNodeId, findLeafNode, findDescendantMessages } from '$lib/u
 import { browser } from '$app/environment';
 import { goto } from '$app/navigation';
 import { extractPartialThinking } from '$lib/utils/thinking';
+import { toast } from 'svelte-sonner';
+import type { ExportedConversations } from '$lib/types/database';

 /**
 * ChatStore - Central state management for chat conversations and AI interactions
@ -951,6 +953,166 @@ class ChatStore {
 		}
 	}

+	/**
+	 * Downloads a conversation as JSON file
+	 * @param convId - The conversation ID to download
+	 */
+	async downloadConversation(convId: string): Promise<void> {
+		if (!this.activeConversation || this.activeConversation.id !== convId) {
+			// Load the conversation if not currently active
+			const conversation = await DatabaseStore.getConversation(convId);
+			if (!conversation) return;
+
+			const messages = await DatabaseStore.getConversationMessages(convId);
+			const conversationData = {
+				conv: conversation,
+				messages
+			};
+
+			this.triggerDownload(conversationData);
+		} else {
+			// Use current active conversation data
+			const conversationData: ExportedConversations = {
+				conv: this.activeConversation!,
+				messages: this.activeMessages
+			};
+
+			this.triggerDownload(conversationData);
+		}
+	}
+
+	/**
+	 * Triggers file download in browser
+	 * @param data - Data to download (expected: { conv: DatabaseConversation, messages: DatabaseMessage[] })
+	 * @param filename - Optional filename
+	 */
+	private triggerDownload(data: ExportedConversations, filename?: string): void {
+		const conversation =
+			'conv' in data ? data.conv : Array.isArray(data) ? data[0]?.conv : undefined;
+		if (!conversation) {
+			console.error('Invalid data: missing conversation');
+			return;
+		}
+		const conversationName = conversation.name ? conversation.name.trim() : '';
+		const convId = conversation.id || 'unknown';
+		const truncatedSuffix = conversationName
+			.toLowerCase()
+			.replace(/[^a-z0-9]/gi, '_')
+			.replace(/_+/g, '_')
+			.substring(0, 20);
+		const downloadFilename = filename || `conversation_${convId}_${truncatedSuffix}.json`;
+
+		const conversationJson = JSON.stringify(data, null, 2);
+		const blob = new Blob([conversationJson], {
+			type: 'application/json'
+		});
+		const url = URL.createObjectURL(blob);
+		const a = document.createElement('a');
+		a.href = url;
+		a.download = downloadFilename;
+		document.body.appendChild(a);
+		a.click();
+		document.body.removeChild(a);
+		URL.revokeObjectURL(url);
+	}
+
+	/**
+	 * Exports all conversations with their messages as a JSON file
+	 */
+	async exportAllConversations(): Promise<void> {
+		try {
+			const allConversations = await DatabaseStore.getAllConversations();
+			if (allConversations.length === 0) {
+				throw new Error('No conversations to export');
+			}
+
+			const allData: ExportedConversations = await Promise.all(
+				allConversations.map(async (conv) => {
+					const messages = await DatabaseStore.getConversationMessages(conv.id);
+					return { conv, messages };
+				})
+			);
+
+			const blob = new Blob([JSON.stringify(allData, null, 2)], {
+				type: 'application/json'
+			});
+			const url = URL.createObjectURL(blob);
+			const a = document.createElement('a');
+			a.href = url;
+			a.download = `all_conversations_${new Date().toISOString().split('T')[0]}.json`;
+			document.body.appendChild(a);
+			a.click();
+			document.body.removeChild(a);
+			URL.revokeObjectURL(url);
+
+			toast.success(`All conversations (${allConversations.length}) prepared for download`);
+		} catch (err) {
+			console.error('Failed to export conversations:', err);
+			throw err;
+		}
+	}
+
+	/**
+	 * Imports conversations from a JSON file.
+	 * Supports both single conversation (object) and multiple conversations (array).
+	 * Uses DatabaseStore for safe, encapsulated data access
+	 */
+	async importConversations(): Promise<void> {
+		return new Promise((resolve, reject) => {
+			const input = document.createElement('input');
+			input.type = 'file';
+			input.accept = '.json';
+
+			input.onchange = async (e) => {
+				const file = (e.target as HTMLInputElement)?.files?.[0];
+				if (!file) {
+					reject(new Error('No file selected'));
+					return;
+				}
+
+				try {
+					const text = await file.text();
+					const parsedData = JSON.parse(text);
+					let importedData: ExportedConversations;
+
+					if (Array.isArray(parsedData)) {
+						importedData = parsedData;
+					} else if (
+						parsedData &&
+						typeof parsedData === 'object' &&
+						'conv' in parsedData &&
+						'messages' in parsedData
+					) {
+						// Single conversation object
+						importedData = [parsedData];
+					} else {
+						throw new Error(
+							'Invalid file format: expected array of conversations or single conversation object'
+						);
+					}
+
+					const result = await DatabaseStore.importConversations(importedData);
+
+					// Refresh UI
+					await this.loadConversations();
+
+					toast.success(`Imported ${result.imported} conversation(s), skipped ${result.skipped}`);
+
+					resolve(undefined);
+				} catch (err: unknown) {
+					const message = err instanceof Error ? err.message : 'Unknown error';
+					console.error('Failed to import conversations:', err);
+					toast.error('Import failed', {
+						description: message
+					});
+					reject(new Error(`Import failed: ${message}`));
+				}
+			};
+
+			input.click();
+		});
+	}
+
 	/**
 	 * Deletes a conversation and all its messages
 	 * @param convId - The conversation ID to delete
@ -1427,6 +1589,9 @@ export const isInitialized = () => chatStore.isInitialized;
 export const maxContextError = () => chatStore.maxContextError;

 export const createConversation = chatStore.createConversation.bind(chatStore);
+export const downloadConversation = chatStore.downloadConversation.bind(chatStore);
+export const exportAllConversations = chatStore.exportAllConversations.bind(chatStore);
+export const importConversations = chatStore.importConversations.bind(chatStore);
 export const deleteConversation = chatStore.deleteConversation.bind(chatStore);
 export const sendMessage = chatStore.sendMessage.bind(chatStore);
 export const gracefulStop = chatStore.gracefulStop.bind(chatStore);
--- a/tools/server/webui/src/lib/stores/database.ts
+++ b/tools/server/webui/src/lib/stores/database.ts
@ -346,4 +346,39 @@ export class DatabaseStore {
 	): Promise<void> {
 		await db.messages.update(id, updates);
 	}
+
+	/**
+	 * Imports multiple conversations and their messages.
+	 * Skips conversations that already exist.
+	 *
+	 * @param data - Array of { conv, messages } objects
+	 */
+	static async importConversations(
+		data: { conv: DatabaseConversation; messages: DatabaseMessage[] }[]
+	): Promise<{ imported: number; skipped: number }> {
+		let importedCount = 0;
+		let skippedCount = 0;
+
+		return await db.transaction('rw', [db.conversations, db.messages], async () => {
+			for (const item of data) {
+				const { conv, messages } = item;
+
+				const existing = await db.conversations.get(conv.id);
+				if (existing) {
+					console.warn(`Conversation "${conv.name}" already exists, skipping...`);
+					skippedCount++;
+					continue;
+				}
+
+				await db.conversations.add(conv);
+				for (const msg of messages) {
+					await db.messages.put(msg);
+				}
+
+				importedCount++;
+			}
+
+			return { imported: importedCount, skipped: skippedCount };
+		});
+	}
 }
--- a/tools/server/webui/src/lib/types/database.d.ts
+++ b/tools/server/webui/src/lib/types/database.d.ts
@ -54,3 +54,18 @@ export interface DatabaseMessage {
 	timings?: ChatMessageTimings;
 	model?: string;
 }
+
+/**
+ * Represents a single conversation with its associated messages,
+ * typically used for import/export operations.
+ */
+export type ExportedConversation = {
+	conv: DatabaseConversation;
+	messages: DatabaseMessage[];
+};
+
+/**
+ * Type representing one or more exported conversations.
+ * Can be a single conversation object or an array of them.
+ */
+export type ExportedConversations = ExportedConversation | ExportedConversation[];