Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-13 02:19:41 +00:00)
Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	.devops/full-cuda.Dockerfile
#	.devops/full-rocm.Dockerfile
#	.devops/full.Dockerfile
#	.devops/llama-cli-cuda.Dockerfile
#	.devops/llama-cli-intel.Dockerfile
#	.devops/llama-cli-rocm.Dockerfile
#	.devops/llama-cli-vulkan.Dockerfile
#	.devops/llama-cli.Dockerfile
#	.devops/llama-server-cuda.Dockerfile
#	.devops/llama-server-intel.Dockerfile
#	.devops/llama-server-rocm.Dockerfile
#	.devops/llama-server-vulkan.Dockerfile
#	.devops/llama-server.Dockerfile
#	CMakeLists.txt
#	CONTRIBUTING.md
#	Makefile
#	ggml/CMakeLists.txt
#	ggml/src/CMakeLists.txt
#	requirements.txt
#	src/llama.cpp
#	tests/test-backend-ops.cpp
Commit 24b9616344: 61 changed files with 12994 additions and 936 deletions
```diff
@@ -211,7 +211,7 @@ retrieval:
   --context-file FNAME      file to load context from (repeat to specify multiple files)
   --chunk-size N            minimum length of embedded text chunks (default: 64)
-  --chunk-separator STRING
+  --chunk-separator STRING
                             separator between chunks (default: '
                             ')
```
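Usage-text hunk for the retrieval example. The removed and added `--chunk-separator` lines render identically in this mirror, so the difference is likely whitespace-only. Note that the default separator is a literal newline, which is why the quoted default wraps onto a second line. As a rough sketch of the behavior these two flags describe (not the retrieval example's actual code):

```js
// Illustrative sketch of the documented chunking behavior: split the input on
// `separator`, then greedily merge pieces until each chunk reaches `chunkSize`
// characters (mirroring --chunk-separator and --chunk-size).
function chunkText(text, chunkSize = 64, separator = '\n') {
  const chunks = [];
  let current = '';
  for (const piece of text.split(separator)) {
    current = current ? current + separator + piece : piece;
    if (current.length >= chunkSize) {
      chunks.push(current);
      current = '';
    }
  }
  if (current) chunks.push(current); // keep the trailing remainder
  return chunks;
}

console.log(chunkText('a\nbb\nccc\ndddd', 4));
// [ 'a\nbb', 'ccc\ndddd' ]
```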
```diff
@@ -256,7 +256,7 @@ server:
   --threads-http N          number of threads used to process HTTP requests (default: -1)
   --system-prompt-file FNAME
                             set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications
-  --log-format {text,json}
+  --log-format {text,json}
                             log output format: json or text (default: json)
   --metrics                 enable prometheus compatible metrics endpoint (default: disabled)
   --no-slots                disables slots monitoring endpoint (default: enabled)
```
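As in the previous hunk, the removed and added `--log-format` lines render identically here. The monitoring flags map to HTTP endpoints that can be probed directly; a minimal sketch, assuming the conventional llama.cpp server paths `/metrics` and `/slots` on a local instance started with `--metrics`:

```js
// Probe the monitoring endpoints described in the usage text above
// (endpoint paths are assumptions based on the flag descriptions).
const base = 'http://localhost:8080';

// Prometheus text exposition format; only served when --metrics was given.
const metrics = await fetch(`${base}/metrics`);
console.log(metrics.status, await metrics.text());

// Per-slot state; expect an error status when started with --no-slots.
const slots = await fetch(`${base}/slots`);
console.log(slots.status, await slots.text());
```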
```diff
@@ -21,7 +21,7 @@ let generation_settings = null;
 //
 export async function* llama(prompt, params = {}, config = {}) {
   let controller = config.controller;
-  const api_url = config.api_url || "";
+  const api_url = config.api_url?.replace(/\/+$/, '') || "";

   if (!controller) {
     controller = new AbortController();
```
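Both completion.js hunks apply the same normalization: `config.api_url?.replace(/\/+$/, '')` strips trailing slashes (optional chaining keeps it safe when `api_url` is undefined), so later template strings like `${api_url}/completion` never produce a double slash. A minimal sketch of the behavior:

```js
// Same normalization as the patched completion.js: strip trailing slashes,
// fall back to "" (same-origin relative requests) when api_url is unset.
const normalize = (api_url) => api_url?.replace(/\/+$/, '') || "";

console.log(normalize('http://localhost:8080/'));      // "http://localhost:8080"
console.log(normalize('http://host/llama/ui///'));     // "http://host/llama/ui"
console.log(normalize(undefined));                     // ""
console.log(`${normalize('http://h/x/')}/completion`); // "http://h/x/completion"
```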
```diff
@@ -196,7 +196,7 @@ export const llamaComplete = async (params, controller, callback) => {
 // Get the model info from the server. This is useful for getting the context window and so on.
 export const llamaModelInfo = async (config = {}) => {
   if (!generation_settings) {
-    const api_url = config.api_url || "";
+    const api_url = config.api_url?.replace(/\/+$/, '') || "";
     const props = await fetch(`${api_url}/props`).then(r => r.json());
     generation_settings = props.default_generation_settings;
   }
```
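This hunk applies the same trailing-slash normalization inside `llamaModelInfo`, which memoizes the `/props` response in the module-level `generation_settings` so the network round trip happens once per page load. A usage sketch, assuming the function returns the cached settings object as its name and the surrounding code suggest:

```js
import { llamaModelInfo } from './completion.js';

// First call fetches ${api_url}/props and caches default_generation_settings.
const settings = await llamaModelInfo({ api_url: 'http://localhost:8080' });
console.log(settings); // e.g. includes the server's default context size

// Second call resolves from the module-level cache without another request.
await llamaModelInfo({ api_url: 'http://localhost:8080' });
```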
```diff
@@ -14,10 +14,10 @@
   <script type="module">
     import {
       html, h, signal, effect, computed, render, useSignal, useEffect, useRef, Component
-    } from '/index.js';
+    } from './index.js';

-    import { llama } from '/completion.js';
-    import { SchemaConverter } from '/json-schema-to-grammar.mjs';
+    import { llama } from './completion.js';
+    import { SchemaConverter } from './json-schema-to-grammar.mjs';
     import { promptFormats } from './prompt-formats.js';
     import { systemPrompts } from './system-prompts.js'; // multilingual is wip
     let selected_image = false;
```
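Switching `/index.js` to `./index.js` matters when the web UI is not served from the site root: an absolute specifier always resolves against the origin, while a relative one resolves against the importing document's URL, so the UI keeps working behind a path prefix (for example, a reverse proxy mounting it under /llama/). The resolution difference, sketched with the URL constructor (host and paths are hypothetical):

```js
// How the browser resolves the two import specifiers when the page lives
// under a path prefix.
const page = 'http://example.com/llama/ui/index.html';

console.log(new URL('/completion.js', page).href);
// "http://example.com/completion.js"            <- absolute path: prefix lost, 404

console.log(new URL('./completion.js', page).href);
// "http://example.com/llama/ui/completion.js"   <- relative: stays under the prefix
```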
```diff
@@ -225,7 +225,7 @@
       throw new Error("already running");
     }
     controller.value = new AbortController();
-    for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
+    for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: URL.parse('.', document.baseURI).href })) {
       const data = chunk.data;
       if (data.stop) {
         while (
```
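The added `api_url` uses `URL.parse('.', document.baseURI)` to compute the directory the page was served from: resolving `'.'` against the page URL yields the enclosing directory with a trailing slash, which the patched completion.js then strips. `URL.parse` is the non-throwing counterpart of `new URL(...)`; it returns null instead of throwing on invalid input. A minimal sketch (example URL is hypothetical):

```js
// Resolve "." against a page URL to get the serving directory, as the
// patched UI does with document.baseURI.
const baseURI = 'http://example.com/llama/ui/index.html';

const dir = URL.parse('.', baseURI); // a URL instance, or null if invalid
console.log(dir.href);               // "http://example.com/llama/ui/"

// completion.js strips the trailing slash before appending endpoint paths:
const api_url = dir.href.replace(/\/+$/, '');
console.log(`${api_url}/completion`); // "http://example.com/llama/ui/completion"
```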
```diff
@@ -479,7 +479,7 @@
       throw new Error("already running");
     }
     controller.value = new AbortController();
-    for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: location.pathname.replace(/\/+$/, '') })) {
+    for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: URL.parse('.', document.baseURI).href })) {
      const data = chunk.data;

      if (data.stop) {
```
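This second call site previously used `location.pathname.replace(/\/+$/, '')`, which breaks when the page URL ends in a file name: the pathname keeps `index.html`, so endpoint paths get appended to it. Resolving `'.'` against the base URI drops the file name instead. A sketch of the failure mode (URLs are hypothetical):

```js
// Why location.pathname was a fragile API base:
const pathname = '/llama/ui/index.html'; // location.pathname for the page
console.log(pathname.replace(/\/+$/, '') + '/completion');
// "/llama/ui/index.html/completion"     <- wrong endpoint

// Resolving "." against the page URL yields the directory instead:
const dir = URL.parse('.', 'http://example.com/llama/ui/index.html').href;
console.log(dir.replace(/\/+$/, '') + '/completion');
// "http://example.com/llama/ui/completion"
```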
```diff
@@ -1183,7 +1183,7 @@ struct server_context {

     bool process_token(completion_token_output & result, server_slot & slot) {
         // remember which tokens were sampled - used for repetition penalties during sampling
-        const std::string token_str = llama_token_to_piece(ctx, result.tok, false);
+        const std::string token_str = llama_token_to_piece(ctx, result.tok, params.special);
         slot.sampled = result.tok;

         // search stop word and delete it
```
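The server change swaps the hard-coded `false` for `params.special` when detokenizing, so `llama_token_to_piece` can render special tokens (BOS/EOS, chat markers) into the returned piece instead of always suppressing them. From the client side, that means streamed text may now contain literal marker strings; a hedged sketch of hiding them in a UI (the marker list is hypothetical and model-specific):

```js
// Client-side sketch: if the server renders special tokens into the stream,
// a UI may want to strip them. These markers depend on the model's template.
const SPECIAL_MARKERS = ['<|im_end|>', '<|eot_id|>', '</s>'];

function stripSpecial(text) {
  return SPECIAL_MARKERS.reduce((t, m) => t.split(m).join(''), text);
}

console.log(stripSpecial('Hello<|im_end|>')); // "Hello"
```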