Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.devops/full-cuda.Dockerfile
#	.devops/full-rocm.Dockerfile
#	.devops/full.Dockerfile
#	.devops/llama-cli-cuda.Dockerfile
#	.devops/llama-cli-intel.Dockerfile
#	.devops/llama-cli-rocm.Dockerfile
#	.devops/llama-cli-vulkan.Dockerfile
#	.devops/llama-cli.Dockerfile
#	.devops/llama-server-cuda.Dockerfile
#	.devops/llama-server-intel.Dockerfile
#	.devops/llama-server-rocm.Dockerfile
#	.devops/llama-server-vulkan.Dockerfile
#	.devops/llama-server.Dockerfile
#	CMakeLists.txt
#	CONTRIBUTING.md
#	Makefile
#	ggml/CMakeLists.txt
#	ggml/src/CMakeLists.txt
#	requirements.txt
#	src/llama.cpp
#	tests/test-backend-ops.cpp
This commit is contained in:
Concedo 2024-07-19 14:23:33 +08:00
commit 24b9616344
61 changed files with 12994 additions and 936 deletions

View file

@ -211,7 +211,7 @@ retrieval:
--context-file FNAME file to load context from (repeat to specify multiple files)
--chunk-size N minimum length of embedded text chunks (default: 64)
--chunk-separator STRING
--chunk-separator STRING
separator between chunks (default: '
')
@ -256,7 +256,7 @@ server:
--threads-http N number of threads used to process HTTP requests (default: -1)
--system-prompt-file FNAME
set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications
--log-format {text,json}
--log-format {text,json}
log output format: json or text (default: json)
--metrics enable prometheus compatible metrics endpoint (default: disabled)
--no-slots disables slots monitoring endpoint (default: enabled)

View file

@ -21,7 +21,7 @@ let generation_settings = null;
//
export async function* llama(prompt, params = {}, config = {}) {
let controller = config.controller;
const api_url = config.api_url || "";
const api_url = config.api_url?.replace(/\/+$/, '') || "";
if (!controller) {
controller = new AbortController();
@ -196,7 +196,7 @@ export const llamaComplete = async (params, controller, callback) => {
// Get the model info from the server. This is useful for getting the context window and so on.
export const llamaModelInfo = async (config = {}) => {
if (!generation_settings) {
const api_url = config.api_url || "";
const api_url = config.api_url?.replace(/\/+$/, '') || "";
const props = await fetch(`${api_url}/props`).then(r => r.json());
generation_settings = props.default_generation_settings;
}

View file

@ -14,10 +14,10 @@
<script type="module">
import {
html, h, signal, effect, computed, render, useSignal, useEffect, useRef, Component
} from '/index.js';
} from './index.js';
import { llama } from '/completion.js';
import { SchemaConverter } from '/json-schema-to-grammar.mjs';
import { llama } from './completion.js';
import { SchemaConverter } from './json-schema-to-grammar.mjs';
import { promptFormats } from './prompt-formats.js';
import { systemPrompts } from './system-prompts.js'; // multilingual is wip
let selected_image = false;
@ -225,7 +225,7 @@
throw new Error("already running");
}
controller.value = new AbortController();
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: URL.parse('.', document.baseURI).href })) {
const data = chunk.data;
if (data.stop) {
while (

View file

@ -479,7 +479,7 @@
throw new Error("already running");
}
controller.value = new AbortController();
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: location.pathname.replace(/\/+$/, '') })) {
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: URL.parse('.', document.baseURI).href })) {
const data = chunk.data;
if (data.stop) {

View file

@ -1183,7 +1183,7 @@ struct server_context {
bool process_token(completion_token_output & result, server_slot & slot) {
// remember which tokens were sampled - used for repetition penalties during sampling
const std::string token_str = llama_token_to_piece(ctx, result.tok, false);
const std::string token_str = llama_token_to_piece(ctx, result.tok, params.special);
slot.sampled = result.tok;
// search stop word and delete it