Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-11 09:34:37 +00:00)
Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	.github/workflows/build.yml
#	.github/workflows/server.yml
#	CMakeLists.txt
#	cmake/build-info.cmake
#	examples/run/CMakeLists.txt
#	examples/run/run.cpp
#	examples/simple-chat/simple-chat.cpp
#	tests/CMakeLists.txt
#	tests/test-backend-ops.cpp
#	tests/test-sampling.cpp
commit 5329df2bdf
24 changed files with 3359 additions and 603 deletions
convert_hf_to_gguf.py
@@ -696,6 +696,9 @@ class Model:
        if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5":
            # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3
            res = "deepseek-v3"
        if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
            # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
            res = "deepseek-r1-qwen"

        if res is None:
            logger.warning("\n")
convert_hf_to_gguf_update.py
@@ -108,6 +108,7 @@ models = [
    {"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
    {"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
    {"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
    {"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
]
26 examples/run/linenoise.cpp/LICENSE Normal file
@@ -0,0 +1,26 @@
Copyright (c) 2010-2014, Salvatore Sanfilippo <antirez at gmail dot com>
Copyright (c) 2010-2013, Pieter Noordhuis <pcnoordhuis at gmail dot com>
Copyright (c) 2025, Eric Curtin <ericcurtin17 at gmail dot com>

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1351 examples/run/linenoise.cpp/linenoise.cpp Normal file
File diff suppressed because it is too large
114 examples/run/linenoise.cpp/linenoise.h Normal file
@@ -0,0 +1,114 @@
/* linenoise.h -- VERSION 1.0
 *
 * Guerrilla line editing library against the idea that a line editing lib
 * needs to be 20,000 lines of C++ code.
 *
 * See linenoise.cpp for more information.
 *
 * ------------------------------------------------------------------------
 *
 * Copyright (c) 2010-2023, Salvatore Sanfilippo <antirez at gmail dot com>
 * Copyright (c) 2010-2013, Pieter Noordhuis <pcnoordhuis at gmail dot com>
 * Copyright (c) 2025, Eric Curtin <ericcurtin17 at gmail dot com>
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __LINENOISE_H
#define __LINENOISE_H

#ifdef __cplusplus
extern "C" {
#endif

#include <stddef.h> /* For size_t. */

extern const char *linenoiseEditMore;

/* The linenoiseState structure represents the state during line editing.
 * We pass this state to functions implementing specific editing
 * functionalities. */
struct linenoiseState {
    int in_completion;      /* The user pressed TAB and we are now in completion
                             * mode, so input is handled by completeLine(). */
    size_t completion_idx;  /* Index of next completion to propose. */
    int ifd;                /* Terminal stdin file descriptor. */
    int ofd;                /* Terminal stdout file descriptor. */
    char *buf;              /* Edited line buffer. */
    size_t buflen;          /* Edited line buffer size. */
    const char *prompt;     /* Prompt to display. */
    size_t plen;            /* Prompt length. */
    size_t pos;             /* Current cursor position. */
    size_t oldpos;          /* Previous refresh cursor position. */
    size_t len;             /* Current edited line length. */
    size_t cols;            /* Number of columns in terminal. */
    size_t oldrows;         /* Rows used by last refreshed line (multiline mode) */
    int history_index;      /* The history index we are currently editing. */
};

typedef struct linenoiseCompletions {
    size_t len;
    char **cvec;
} linenoiseCompletions;

/* Non blocking API. */
int linenoiseEditStart(struct linenoiseState *l, int stdin_fd, int stdout_fd, char *buf, size_t buflen, const char *prompt);
const char *linenoiseEditFeed(struct linenoiseState *l);
void linenoiseEditStop(struct linenoiseState *l);
void linenoiseHide(struct linenoiseState *l);
void linenoiseShow(struct linenoiseState *l);

/* Blocking API. */
const char *linenoise(const char *prompt);
void linenoiseFree(void *ptr);

/* Completion API. */
typedef void(linenoiseCompletionCallback)(const char *, linenoiseCompletions *);
typedef const char*(linenoiseHintsCallback)(const char *, int *color, int *bold);
typedef void(linenoiseFreeHintsCallback)(const char *);
void linenoiseSetCompletionCallback(linenoiseCompletionCallback *);
void linenoiseSetHintsCallback(linenoiseHintsCallback *);
void linenoiseSetFreeHintsCallback(linenoiseFreeHintsCallback *);
void linenoiseAddCompletion(linenoiseCompletions *, const char *);

/* History API. */
int linenoiseHistoryAdd(const char *line);
int linenoiseHistorySetMaxLen(int len);
int linenoiseHistorySave(const char *filename);
int linenoiseHistoryLoad(const char *filename);

/* Other utilities. */
void linenoiseClearScreen(void);
void linenoiseSetMultiLine(int ml);
void linenoisePrintKeyCodes(void);
void linenoiseMaskModeEnable(void);
void linenoiseMaskModeDisable(void);

#ifdef __cplusplus
}
#endif

#endif /* __LINENOISE_H */
examples/run/run.cpp
File diff suppressed because it is too large
examples/server/server.cpp
@@ -19,6 +19,7 @@
#include "loading.html.hpp"

#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstddef>
#include <cinttypes>
@@ -32,6 +33,8 @@

using json = nlohmann::ordered_json;

constexpr int HTTP_POLLING_SECONDS = 1;

enum stop_type {
    STOP_TYPE_NONE,
    STOP_TYPE_EOS,
@@ -1602,6 +1605,30 @@ struct server_response {
        // should never reach here
    }

    // same as recv(), but with a timeout in seconds
    // if the timeout is reached, nullptr is returned
    server_task_result_ptr recv_with_timeout(const std::unordered_set<int> & id_tasks, int timeout) {
        while (true) {
            std::unique_lock<std::mutex> lock(mutex_results);
            bool cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout), [&]{
                return !queue_results.empty();
            });
            if (!cr_res) {
                return nullptr;
            }

            for (int i = 0; i < (int) queue_results.size(); i++) {
                if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) {
                    server_task_result_ptr res = std::move(queue_results[i]);
                    queue_results.erase(queue_results.begin() + i);
                    return res;
                }
            }
        }

        // should never reach here
    }

    // single-task version of recv()
    server_task_result_ptr recv(int id_task) {
        std::unordered_set<int> id_tasks = {id_task};
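For reference, a minimal standalone sketch of the std::condition_variable::wait_for semantics that recv_with_timeout relies on: with a predicate, wait_for returns false only if the timeout expires while the predicate is still false (spurious wakeups are absorbed), so a false result really does mean "nothing arrived in time". The names below are illustrative, not taken from the server:

#include <chrono>
#include <condition_variable>
#include <mutex>
#include <queue>

std::mutex m;
std::condition_variable cv;
std::queue<int> q;

// returns true and pops into `out` if an item arrived within `secs` seconds,
// false on timeout so the caller can poll connection state and retry
bool recv_one(int & out, int secs) {
    std::unique_lock<std::mutex> lock(m);
    if (!cv.wait_for(lock, std::chrono::seconds(secs), [&] { return !q.empty(); })) {
        return false; // timeout: predicate still false
    }
    out = q.front();
    q.pop();
    return true;
}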
@@ -2322,10 +2349,21 @@ struct server_context {
    void receive_multi_results(
            const std::unordered_set<int> & id_tasks,
            const std::function<void(std::vector<server_task_result_ptr>&)> & result_handler,
-           const std::function<void(json)> & error_handler) {
+           const std::function<void(json)> & error_handler,
+           const std::function<bool()> & is_connection_closed) {
        std::vector<server_task_result_ptr> results(id_tasks.size());
-       for (size_t i = 0; i < id_tasks.size(); i++) {
-           server_task_result_ptr result = queue_results.recv(id_tasks);
+       for (int i = 0; i < (int)id_tasks.size(); i++) {
+           server_task_result_ptr result = queue_results.recv_with_timeout(id_tasks, HTTP_POLLING_SECONDS);
+
+           if (is_connection_closed()) {
+               cancel_tasks(id_tasks);
+               return;
+           }
+
+           if (result == nullptr) {
+               i--; // retry
+               continue;
+           }

            if (result->is_error()) {
                error_handler(result->to_json());
@@ -2349,10 +2387,20 @@ struct server_context {
    void receive_cmpl_results_stream(
            const std::unordered_set<int> & id_tasks,
            const std::function<bool(server_task_result_ptr&)> & result_handler,
-           const std::function<void(json)> & error_handler) {
+           const std::function<void(json)> & error_handler,
+           const std::function<bool()> & is_connection_closed) {
        size_t n_finished = 0;
        while (true) {
-           server_task_result_ptr result = queue_results.recv(id_tasks);
+           server_task_result_ptr result = queue_results.recv_with_timeout(id_tasks, HTTP_POLLING_SECONDS);
+
+           if (is_connection_closed()) {
+               cancel_tasks(id_tasks);
+               return;
+           }
+
+           if (result == nullptr) {
+               continue; // retry
+           }

            if (result->is_error()) {
                error_handler(result->to_json());
@@ -3633,6 +3681,7 @@ int main(int argc, char ** argv) {
    const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok](
            server_task_type type,
            json & data,
+           std::function<bool()> is_connection_closed,
            httplib::Response & res,
            oaicompat_type oaicompat) {
        GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL);
@@ -3694,7 +3743,7 @@ int main(int argc, char ** argv) {
                }
            }, [&](const json & error_data) {
                res_error(res, error_data);
-           });
+           }, is_connection_closed);

            ctx_server.queue_results.remove_waiting_task_ids(task_ids);
        } else {
@@ -3704,6 +3753,7 @@ int main(int argc, char ** argv) {
                if (res_json.is_array()) {
                    for (const auto & res : res_json) {
                        if (!server_sent_event(sink, "data", res)) {
+                           // sending failed (HTTP connection closed), cancel the generation
                            return false;
                        }
                    }
@@ -3713,6 +3763,9 @@ int main(int argc, char ** argv) {
                }
            }, [&](const json & error_data) {
                server_sent_event(sink, "error", error_data);
+           }, [&sink]() {
+               // note: do not use req.is_connection_closed here because req is already destroyed
+               return !sink.is_writable();
            });
            if (oaicompat != OAICOMPAT_TYPE_NONE) {
                static const std::string ev_done = "data: [DONE]\n\n";
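The `[&sink]()` callback above is how the streaming path detects a dropped client: once streaming starts, the request object is already destroyed, so writability of the sink is the only signal left. A minimal standalone sketch of the same idea with cpp-httplib (the HTTP library the server uses; the endpoint and payload below are made up for illustration):

#include <httplib.h>
#include <string>

int main() {
    httplib::Server svr;
    svr.Get("/stream", [](const httplib::Request &, httplib::Response & res) {
        res.set_chunked_content_provider("text/event-stream",
            [](size_t offset, httplib::DataSink & sink) {
                if (!sink.is_writable()) {
                    return false;                 // client disconnected -> stop generating
                }
                if (offset > 1024) {              // arbitrary end condition for the sketch
                    sink.done();
                    return true;
                }
                const std::string ev = "data: token\n\n";
                sink.write(ev.data(), ev.size()); // one SSE event per call
                return true;
            });
    });
    svr.listen("127.0.0.1", 8080);
    return 0;
}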
@@ -3735,6 +3788,7 @@ int main(int argc, char ** argv) {
        return handle_completions_impl(
            SERVER_TASK_TYPE_COMPLETION,
            data,
+           req.is_connection_closed,
            res,
            OAICOMPAT_TYPE_NONE);
    };
@@ -3744,6 +3798,7 @@ int main(int argc, char ** argv) {
        return handle_completions_impl(
            SERVER_TASK_TYPE_COMPLETION,
            data,
+           req.is_connection_closed,
            res,
            OAICOMPAT_TYPE_COMPLETION);
    };
@@ -3820,6 +3875,7 @@ int main(int argc, char ** argv) {
        return handle_completions_impl(
            SERVER_TASK_TYPE_INFILL,
            data,
+           req.is_connection_closed,
            res,
            OAICOMPAT_TYPE_NONE); // infill is not OAI compatible
    };
@@ -3834,6 +3890,7 @@ int main(int argc, char ** argv) {
        return handle_completions_impl(
            SERVER_TASK_TYPE_COMPLETION,
            data,
+           req.is_connection_closed,
            res,
            OAICOMPAT_TYPE_CHAT);
    };
@@ -3980,7 +4037,7 @@ int main(int argc, char ** argv) {
        }, [&](const json & error_data) {
            res_error(res, error_data);
            error = true;
-       });
+       }, req.is_connection_closed);

        ctx_server.queue_results.remove_waiting_task_ids(task_ids);
    }
@@ -4070,7 +4127,7 @@ int main(int argc, char ** argv) {
        }, [&](const json & error_data) {
            res_error(res, error_data);
            error = true;
-       });
+       }, req.is_connection_closed);
    }

    if (error) {
examples/server/tests/unit/test_completion.py
@@ -1,4 +1,5 @@
import pytest
+import requests
import time
from openai import OpenAI
from utils import *
@@ -405,3 +406,23 @@ def test_n_probs_post_sampling():
            assert "bytes" in prob and type(prob["bytes"]) == list
        # because the test model usually outputs tokens with either 100% or 0% probability, we need to check all the top_probs
        assert any(prob["prob"] == 1.0 for prob in tok["top_probs"])


def test_cancel_request():
    global server
    server.n_ctx = 4096
    server.n_predict = -1
    server.n_slots = 1
    server.server_slots = True
    server.start()
    # send a request that will take a long time, but cancel it before it finishes
    try:
        server.make_request("POST", "/completion", data={
            "prompt": "I believe the meaning of life is",
        }, timeout=0.1)
    except requests.exceptions.ReadTimeout:
        pass  # expected
    # make sure the slot is free
    time.sleep(1)  # wait for HTTP_POLLING_SECONDS
    res = server.make_request("GET", "/slots")
    assert res.body[0]["is_processing"] == False
examples/server/tests/utils.py
@@ -26,6 +26,9 @@ from re import RegexFlag
import wget


DEFAULT_HTTP_TIMEOUT = 10 if "LLAMA_SANITIZE" not in os.environ else 30


class ServerResponse:
    headers: dict
    status_code: int
@@ -88,7 +91,7 @@ class ServerProcess:
        if "PORT" in os.environ:
            self.server_port = int(os.environ["PORT"])

-   def start(self, timeout_seconds: int = 10) -> None:
+   def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None:
        if "LLAMA_SERVER_BIN_PATH" in os.environ:
            server_path = os.environ["LLAMA_SERVER_BIN_PATH"]
        elif os.name == "nt":
@@ -219,17 +222,18 @@ class ServerProcess:
        path: str,
        data: dict | Any | None = None,
        headers: dict | None = None,
+       timeout: float | None = None,
    ) -> ServerResponse:
        url = f"http://{self.server_host}:{self.server_port}{path}"
        parse_body = False
        if method == "GET":
-           response = requests.get(url, headers=headers)
+           response = requests.get(url, headers=headers, timeout=timeout)
            parse_body = True
        elif method == "POST":
-           response = requests.post(url, headers=headers, json=data)
+           response = requests.post(url, headers=headers, json=data, timeout=timeout)
            parse_body = True
        elif method == "OPTIONS":
-           response = requests.options(url, headers=headers)
+           response = requests.options(url, headers=headers, timeout=timeout)
            parse_body = True
        else:
            raise ValueError(f"Unimplemented method: {method}")
        result = ServerResponse()
examples/tts/tts.cpp
@@ -425,6 +425,33 @@ static void prompt_init(llama_tokens & prompt, const llama_vocab * vocab) {
    prompt_add(prompt, vocab, "<|im_start|>\n", true, true);
}

static std::vector<llama_token> prepare_guide_tokens(const llama_vocab * vocab, const std::string & str) {
    const std::string& delimiter = "<|text_sep|>";

    std::vector<llama_token> result;
    size_t start = 0;
    size_t end = str.find(delimiter);

    // first token is always a newline, as it was not previously added
    result.push_back(common_tokenize(vocab, "\n", false, true)[0]);

    while (end != std::string::npos) {
        std::string current_word = str.substr(start, end - start);
        auto tmp = common_tokenize(vocab, current_word, false, true);
        result.push_back(tmp[0]);
        start = end + delimiter.length();
        end = str.find(delimiter, start);
    }

    // Add the last part
    std::string current_word = str.substr(start);
    auto tmp = common_tokenize(vocab, current_word, false, true);
    if (tmp.size() > 0) {
        result.push_back(tmp[0]);
    }
    return result;
}

int main(int argc, char ** argv) {
    common_params params;
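To make the splitting concrete, here is a worked standalone example of the find/substr loop used by prepare_guide_tokens above; the real function additionally tokenizes each fragment and keeps only token [0] of each word:

#include <iostream>
#include <string>

int main() {
    const std::string str = "hello<|text_sep|>world<|text_sep|>again";
    const std::string delimiter = "<|text_sep|>";

    size_t start = 0;
    size_t end = str.find(delimiter);
    while (end != std::string::npos) {
        std::cout << str.substr(start, end - start) << "\n"; // prints "hello", then "world"
        start = end + delimiter.length();
        end = str.find(delimiter, start);
    }
    std::cout << str.substr(start) << "\n"; // prints "again" (the last part)
    return 0;
}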
@@ -494,6 +521,7 @@ int main(int argc, char ** argv) {
    const auto t_main_start = ggml_time_us();

    std::vector<llama_token> codes;
+   std::vector<llama_token> guide_tokens;

    // process prompt and generate voice codes
    {
@@ -508,6 +536,9 @@ int main(int argc, char ** argv) {
        // convert the input text into the necessary format expected by OuteTTS
        {
            std::string prompt_clean = process_text(params.prompt);
+           if (params.vocoder.use_guide_tokens) {
+               guide_tokens = prepare_guide_tokens(vocab, prompt_clean);
+           }

            LOG_INF("%s: prompt: '%s'\n", __func__, prompt_clean.c_str());

@@ -717,6 +748,8 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
    int n_past   = batch.n_tokens;
    int n_decode = 0;

+   bool next_token_uses_guide_token = true;
+
    while (n_decode <= n_predict) {
        // prepare the next batch
        common_batch_clear(batch);
@@ -728,7 +761,17 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
                continue;
            }

-           const llama_token new_token_id = common_sampler_sample(smpl[i], ctx_ttc, i_batch[i]);
+           llama_token new_token_id = common_sampler_sample(smpl[i], ctx_ttc, i_batch[i]);
+
+           // guide tokens help prevent hallucinations by forcing the TTS to use the correct word
+           if (!guide_tokens.empty() && next_token_uses_guide_token && !llama_vocab_is_control(vocab, new_token_id) && !llama_vocab_is_eog(vocab, new_token_id)) {
+               llama_token guide_token = guide_tokens[0];
+               guide_tokens.erase(guide_tokens.begin());
+               new_token_id = guide_token; // ensure the correct word fragment is used
+           }
+
+           // this is the token id that always precedes a new word
+           next_token_uses_guide_token = (new_token_id == 198);

            common_sampler_accept(smpl[i], new_token_id, true);
ggml/src/ggml-sycl/common.hpp
@@ -333,8 +333,12 @@ struct ggml_backend_sycl_context {
    // pool
    std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];

+   std::unique_ptr<ggml_sycl_pool> host_pools[GGML_SYCL_MAX_DEVICES];
+
    static std::unique_ptr<ggml_sycl_pool> new_pool_for_device(queue_ptr qptr, int device);

+   static std::unique_ptr<ggml_sycl_pool> new_pool_for_host(queue_ptr qptr, int device);
+
    ggml_sycl_pool & pool(int device) {
        if (pools[device] == nullptr) {
            pools[device] = new_pool_for_device(stream(device,0), device);
@@ -345,6 +349,15 @@ struct ggml_backend_sycl_context {
    ggml_sycl_pool & pool() {
        return pool(device);
    }

+   ggml_sycl_pool & host_pool(int device) {
+       if (host_pools[device] == nullptr) {
+           host_pools[device] = new_pool_for_host(stream(device, 0), device);
+       }
+       return *host_pools[device];
+   }
+
+   ggml_sycl_pool & host_pool() { return host_pool(device); }
};

// common device functions
ggml/src/ggml-sycl/dpct/helper.hpp
@@ -82,6 +82,14 @@ inline std::string get_device_backend_and_type(const sycl::device &device) {
    return device_type.str();
}

+template <typename Ts> struct matrix_info_t {
+    oneapi::mkl::transpose transpose_info[2];
+    Ts value_info[2];
+    std::int64_t size_info[3];
+    std::int64_t ld_info[3];
+    std::int64_t groupsize_info;
+};
+
namespace dpct
{
    typedef sycl::queue *queue_ptr;
@@ -1727,26 +1735,13 @@ namespace dpct
    };

    template <class Ta, class Tb, class Tc, class Ts>
-   inline void gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                               oneapi::mkl::transpose b_trans, int m, int n, int k,
-                               const void *alpha, const void **a, int lda,
-                               const void **b, int ldb, const void *beta, void **c,
-                               int ldc, int batch_size)
-   {
-       struct matrix_info_t
-       {
-           oneapi::mkl::transpose transpose_info[2];
-           Ts value_info[2];
-           std::int64_t size_info[3];
-           std::int64_t ld_info[3];
-           std::int64_t groupsize_info;
-       };
-
+   inline void gemm_batch_impl(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans,
+                               int m, int n, int k, const void * alpha, const void ** a, int lda, const void ** b,
+                               int ldb, const void * beta, void ** c, int ldc, int batch_size,
+                               matrix_info_t<float> * matrix_info) {
        Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
        Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);

-       matrix_info_t *matrix_info =
-           (matrix_info_t *)std::malloc(sizeof(matrix_info_t));
        matrix_info->transpose_info[0] = a_trans;
        matrix_info->transpose_info[1] = b_trans;
        matrix_info->value_info[0] = alpha_value;
@@ -1763,23 +1758,18 @@ namespace dpct
        sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
            oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ q }, matrix_info->transpose_info,
            matrix_info->transpose_info + 1, matrix_info->size_info, matrix_info->size_info + 1,
-           matrix_info->size_info + 2, matrix_info->value_info, reinterpret_cast<const Ta **>(a),
-           matrix_info->ld_info, reinterpret_cast<const Tb **>(b), matrix_info->ld_info + 1,
-           matrix_info->value_info + 1, reinterpret_cast<Tc **>(c), matrix_info->ld_info + 2, 1,
-           &(matrix_info->groupsize_info));
+           matrix_info->size_info + 2, reinterpret_cast<Ts *>(matrix_info->value_info),
+           reinterpret_cast<const Ta **>(a), matrix_info->ld_info, reinterpret_cast<const Tb **>(b),
+           matrix_info->ld_info + 1, reinterpret_cast<Ts *>(matrix_info->value_info + 1),
+           reinterpret_cast<Tc **>(c), matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
#else
        sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
            q, matrix_info->transpose_info, matrix_info->transpose_info + 1, matrix_info->size_info,
-           matrix_info->size_info + 1, matrix_info->size_info + 2, matrix_info->value_info,
+           matrix_info->size_info + 1, matrix_info->size_info + 2, reinterpret_cast<Ts *>(matrix_info->value_info),
            reinterpret_cast<const Ta **>(a), matrix_info->ld_info, reinterpret_cast<const Tb **>(b),
-           matrix_info->ld_info + 1, matrix_info->value_info + 1, reinterpret_cast<Tc **>(c),
-           matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
+           matrix_info->ld_info + 1, reinterpret_cast<Ts *>(matrix_info->value_info + 1),
+           reinterpret_cast<Tc **>(c), matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
#endif

-       q.submit([&](sycl::handler &cgh)
-                {
-                    cgh.depends_on(e);
-                    cgh.host_task([=] { std::free(matrix_info); }); });
    }

    template <class Ta, class Tb, class Tc, class Ts>
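The net effect of the two hunks above: matrix_info used to be malloc'd per call and released asynchronously via a host_task chained on the GEMM event; it now arrives as a caller-owned pointer backed by the new host pool, so the deferred free disappears. For reference, a minimal sketch of the deferred-release pattern being removed, assuming a SYCL 2020 compiler (function name is illustrative):

#include <sycl/sycl.hpp>
#include <cstdlib>

// Chain a host-side free on completion of `e`: the host_task runs only after
// the event finishes, so the buffer outlives the asynchronous work that uses it.
void defer_free(sycl::queue & q, sycl::event e, void * buf) {
    q.submit([&](sycl::handler & cgh) {
        cgh.depends_on(e);
        cgh.host_task([=] { std::free(buf); });
    });
}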
@@ -2422,25 +2412,11 @@ namespace dpct
    /// \param [in] ldc Leading dimension of C.
    /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
    /// \param [in] scaling_type Data type of the scaling factors.
-   inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                          oneapi::mkl::transpose b_trans, int m, int n, int k,
-                          const void *alpha, const void *a[],
-                          library_data_t a_type, int lda, const void *b[],
-                          library_data_t b_type, int ldb, const void *beta,
-                          void *c[], library_data_t c_type, int ldc,
-                          int batch_size, library_data_t scaling_type)
-   {
-       if (scaling_type == library_data_t::real_float &&
-           c_type == library_data_t::complex_float)
-       {
-           scaling_type = library_data_t::complex_float;
-       }
-       else if (scaling_type == library_data_t::real_double &&
-                c_type == library_data_t::complex_double)
-       {
-           scaling_type = library_data_t::complex_double;
-       }
-
+   inline void gemm_batch(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m,
+                          int n, int k, const void * alpha, const void * a[], library_data_t a_type, int lda,
+                          const void * b[], library_data_t b_type, int ldb, const void * beta, void * c[],
+                          library_data_t c_type, int ldc, int batch_size, library_data_t scaling_type,
+                          matrix_info_t<float> * matrix_info) {
        std::uint64_t key =
            detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
        switch (key)
@@ -2449,48 +2425,24 @@ namespace dpct
            library_data_t::real_float, library_data_t::real_float,
            library_data_t::real_float, library_data_t::real_float):
        {
-           detail::gemm_batch_impl<float, float, float, float>(
-               q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-               batch_size);
+           detail::gemm_batch_impl<float, float, float, float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb,
+                                                               beta, c, ldc, batch_size, matrix_info);
            break;
        }
        case detail::get_type_combination_id(
            library_data_t::real_double, library_data_t::real_double,
            library_data_t::real_double, library_data_t::real_double):
        {
-           detail::gemm_batch_impl<double, double, double, double>(
-               q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-               batch_size);
-           break;
-       }
-       case detail::get_type_combination_id(
-           library_data_t::complex_float, library_data_t::complex_float,
-           library_data_t::complex_float, library_data_t::complex_float):
-       {
-           detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
-                                   std::complex<float>, std::complex<float>>(
-               q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-               batch_size);
-           break;
-       }
-       case detail::get_type_combination_id(
-           library_data_t::complex_double, library_data_t::complex_double,
-           library_data_t::complex_double, library_data_t::complex_double):
-       {
-           detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
-                                   std::complex<double>, std::complex<double>>(
-               q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-               batch_size);
+           detail::gemm_batch_impl<double, double, double, double>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb,
+                                                                   beta, c, ldc, batch_size, matrix_info);
            break;
        }
        case detail::get_type_combination_id(
            library_data_t::real_half, library_data_t::real_half,
            library_data_t::real_half, library_data_t::real_half):
        {
-           detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
-                                   sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
-                                               a, lda, b, ldb, beta, c, ldc,
-                                               batch_size);
+           detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
+               q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
            break;
        }
#ifdef __INTEL_MKL__
@@ -2498,19 +2450,16 @@ namespace dpct
            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
            library_data_t::real_bfloat16, library_data_t::real_float):
        {
-           detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
-                                   oneapi::mkl::bfloat16, float>(
-               q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-               batch_size);
+           detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float>(
+               q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
            break;
        }
        case detail::get_type_combination_id(
            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
            library_data_t::real_float, library_data_t::real_float):
        {
-           detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
-                                   float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
-                                          b, ldb, beta, c, ldc, batch_size);
+           detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, float>(
+               q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
            break;
        }
#endif
@@ -2522,10 +2471,9 @@ namespace dpct
            dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
        float beta_float =
            dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
-       detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
-                               float>(q, a_trans, b_trans, m, n, k, &alpha_float,
-                                      a, lda, b, ldb, &beta_float, c, ldc,
-                                      batch_size);
+       detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t, float>(
+           q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc, batch_size,
+           matrix_info);
        break;
    }
    case detail::get_type_combination_id(
@@ -2533,8 +2481,7 @@ namespace dpct
        library_data_t::real_float, library_data_t::real_float):
    {
        detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
-           q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-           batch_size);
+           q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
        break;
    }
    case detail::get_type_combination_id(
@@ -2542,8 +2489,7 @@ namespace dpct
        library_data_t::real_float, library_data_t::real_float):
    {
        detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
-           q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-           batch_size);
+           q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
        break;
    }
    case detail::get_type_combination_id(
@@ -2557,8 +2503,7 @@ namespace dpct
        sycl::half alpha_half(alpha_value);
        sycl::half beta_half(beta_value);
        detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
-           q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, b, ldb, &beta_half, c, ldc,
-           batch_size);
+           q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, b, ldb, &beta_half, c, ldc, batch_size, matrix_info);
        break;
    }
    default:
ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -1173,6 +1173,85 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
    }
};

struct ggml_sycl_pool_host : public ggml_sycl_pool {
    queue_ptr qptr;
    int       device;

    inline static int counter{ 0 };

    struct ggml_sycl_buffer {
        void * ptr  = nullptr;
        size_t size = 0;
    };

    // Set arbitrarily to 64
    static constexpr int MAX_POOL_SIZE{ 64 };

    std::vector<ggml_sycl_buffer> buffer_pool = std::vector<ggml_sycl_buffer>(MAX_POOL_SIZE);
    size_t pool_size = 0;

    explicit ggml_sycl_pool_host(queue_ptr qptr_, int device_) : qptr(qptr_), device(device_) {}

    ~ggml_sycl_pool_host() {
        for (int i = 0; i < MAX_POOL_SIZE; ++i) {
            ggml_sycl_buffer & b = buffer_pool[i];
            if (b.ptr != nullptr) {
                SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(b.ptr, *qptr)));
                b.ptr = nullptr;
                pool_size -= b.size;
                b.size = 0;
            }
        }
        counter = 0;
    }

    void * alloc(size_t size, size_t * actual_size) override {
        if (counter == MAX_POOL_SIZE) {
            ggml_sycl_buffer b = buffer_pool[0];
            void * ptr = b.ptr;
            *actual_size = b.size;
            counter = 1;
            return ptr;
        }
        ggml_sycl_buffer & b = buffer_pool[counter];

        if (b.ptr == nullptr) {
            void * ptr;

            SYCL_CHECK(CHECK_TRY_ERROR(ptr = (void *) sycl::malloc_host(size, *qptr)));
            if (!ptr) {
                GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on host\n", __func__, size);
                return nullptr;
            }
            pool_size += size;
            *actual_size = size;
            counter = counter + 1;
            return ptr;
        } else {
            ++counter;
            b.size = size;
            return b.ptr;
        }
    }

    void free(void * ptr, size_t size) override {
        // If the pool is not yet full, add the pointer to it in place of the first nullptr found.
        // Otherwise do nothing; the pointers will be freed once the pool is deallocated.
        for (int i = 0; i < MAX_POOL_SIZE; ++i) {
            ggml_sycl_buffer & b = buffer_pool[i];
            if (b.ptr == nullptr) {
                b.ptr = ptr;
                b.size = size;
                return;
            }
        }
    }
};

std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_host(queue_ptr qptr, int device) {
    // return a host pool to speed up memory management
    return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_host(qptr, device));
}

std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device) {
    // TBD: NO VMM support
    // if (ggml_sycl_info().devices[device].vmm) {
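The new pool keeps up to MAX_POOL_SIZE pinned host buffers alive and recycles them instead of calling sycl::malloc_host/sycl::free on every allocation. Callers consume it through the existing ggml_sycl_pool_alloc RAII wrapper, which is exactly what the mul_mat hunk that follows does (sketch assumes `ctx` is a ggml_backend_sycl_context):

// take one matrix_info_t<float> slot from the pinned host pool; the slot is
// handed back to the pool when the wrapper goes out of scope
ggml_sycl_pool_alloc<matrix_info_t<float>> matrix_info(ctx.host_pool(), 1);
matrix_info_t<float> * info = matrix_info.get(); // stays valid for this scope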
@@ -3363,6 +3442,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
        ggml_sycl_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
        ggml_sycl_pool_alloc<      void *> ptrs_dst(ctx.pool(), 1*ne23);
+       ggml_sycl_pool_alloc<matrix_info_t<float>> matrix_info(ctx.host_pool(), 1);

        sycl::range<3> block_dims(1, ne12, ne13);
        /*
@@ -3391,14 +3471,10 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
            });
        }
        SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
-           *main_stream, oneapi::mkl::transpose::trans,
-           oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
-           (const void **)(ptrs_src.get() + 0 * ne23),
-           dpct::library_data_t::real_half, nb01 / nb00,
-           (const void **)(ptrs_src.get() + 1 * ne23),
-           dpct::library_data_t::real_half, nb11 / nb10, beta,
-           (void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
-           cu_compute_type)));
+           *main_stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
+           (const void **) (ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / nb00,
+           (const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, nb11 / nb10, beta,
+           (void **) (ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23, cu_compute_type, matrix_info.get())));
    }
}
catch (sycl::exception const &exc) {
ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -390,10 +390,13 @@ struct vk_flash_attn_push_constants {
    uint32_t nev3;
    uint32_t nem1;

+   uint32_t nb01;
    uint32_t nb02;
    uint32_t nb03;
+   uint32_t nb11;
    uint32_t nb12;
    uint32_t nb13;
+   uint32_t nb21;
    uint32_t nb22;
    uint32_t nb23;
    uint32_t nb31;
@@ -4817,7 +4820,14 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
    }
    assert(pipelines);

-   bool aligned = (KV % pipelines[1]->align) == 0;
+   const uint32_t q_stride = (uint32_t)(nbq1 / ggml_type_size(q->type));
+   const uint32_t k_stride = (uint32_t)(nbk1 / ggml_type_size(k->type));
+   const uint32_t v_stride = (uint32_t)(nbv1 / ggml_type_size(v->type));
+
+   bool aligned = (KV % pipelines[1]->align) == 0 &&
+                  // the "aligned" shader variant will forcibly align strides, for performance
+                  (q_stride & 7) == 0 && (k_stride & 7) == 0 && (v_stride & 7) == 0;

    vk_pipeline pipeline = pipelines[aligned];
    assert(pipeline);
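The new stride checks are plain power-of-two alignment tests: `x & 7` keeps the low three bits of x, so the result is zero exactly when x is a multiple of 8 elements. A tiny standalone illustration (the stride values are made up):

#include <cassert>
#include <cstdint>

int main() {
    const uint32_t q_stride = 4096;  // multiple of 8 -> aligned
    const uint32_t k_stride = 4100;  // not a multiple of 8
    assert((q_stride & 7) == 0);     // selects the "aligned" shader variant
    assert((k_stride & 7) != 0);     // would force the unaligned variant
    return 0;
}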
@@ -4853,15 +4863,15 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx

    if (ctx->device->uma) {
        ggml_vk_host_get(ctx->device, q->data, d_Q, q_buf_offset);
-       ggml_vk_host_get(ctx->device, k->data, d_K, q_buf_offset);
-       ggml_vk_host_get(ctx->device, v->data, d_V, q_buf_offset);
-       ggml_vk_host_get(ctx->device, dst->data, d_D, q_buf_offset);
+       ggml_vk_host_get(ctx->device, k->data, d_K, k_buf_offset);
+       ggml_vk_host_get(ctx->device, v->data, d_V, v_buf_offset);
+       ggml_vk_host_get(ctx->device, dst->data, d_D, d_buf_offset);
        Q_uma = d_Q != nullptr;
        K_uma = d_K != nullptr;
        V_uma = d_V != nullptr;
        D_uma = d_D != nullptr;
        if (mask) {
-           ggml_vk_host_get(ctx->device, mask->data, d_M, q_buf_offset);
+           ggml_vk_host_get(ctx->device, mask->data, d_M, m_buf_offset);
            M_uma = d_M != nullptr;
        }
    }
@@ -4899,7 +4909,18 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
        }
    }

-   const vk_flash_attn_push_constants pc = { N, KV, (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3, (uint32_t)neq2, (uint32_t)neq3, (uint32_t)nek2, (uint32_t)nek3, (uint32_t)nev2, (uint32_t)nev3, nem1, (uint32_t)nbq2, (uint32_t)nbq3, (uint32_t)nbk2, (uint32_t)nbk3, (uint32_t)nbv2, (uint32_t)nbv3, nbm1, scale, max_bias, logit_softcap, mask != nullptr, n_head_log2, m0, m1 };
+   const vk_flash_attn_push_constants pc = { N, KV,
+                                             (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3,
+                                             (uint32_t)neq2, (uint32_t)neq3,
+                                             (uint32_t)nek2, (uint32_t)nek3,
+                                             (uint32_t)nev2, (uint32_t)nev3,
+                                             nem1,
+                                             q_stride, (uint32_t)nbq2, (uint32_t)nbq3,
+                                             k_stride, (uint32_t)nbk2, (uint32_t)nbk3,
+                                             v_stride, (uint32_t)nbv2, (uint32_t)nbv3,
+                                             nbm1,
+                                             scale, max_bias, logit_softcap,
+                                             mask != nullptr, n_head_log2, m0, m1 };
    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
        {
            vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE},
@@ -8676,6 +8697,7 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
    ggml_tensor * src0 = tensor->src[0];
    ggml_tensor * src1 = tensor->src[1];
    ggml_tensor * src2 = tensor->src[2];
+   ggml_tensor * src3 = tensor->src[3];

    void * tensor_data = tensor->data;

@@ -8738,6 +8760,9 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
        if (src2 != nullptr) {
            std::cerr << "src2=" << src2 << " src2->name=" << src2->name << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl;
        }
+       if (src3 != nullptr) {
+           std::cerr << "src3=" << src3 << " src3->name=" << src3->name << " op=" << ggml_op_name(src3->op) << " type=" << ggml_type_name(src3->type) << " ne0=" << src3->ne[0] << " nb0=" << src3->nb[0] << " ne1=" << src3->ne[1] << " nb1=" << src3->nb[1] << " ne2=" << src3->ne[2] << " nb2=" << src3->nb[2] << " ne3=" << src3->ne[3] << " nb3=" << src3->nb[3] << " offset=" << src3->view_offs << std::endl;
+       }
        std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
        std::cerr << std::endl << "Result:" << std::endl;
        ggml_vk_print_tensor_area(tensor, tensor_data, i0, i1, i2, i3);
@@ -8782,6 +8807,9 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
        if (src2 != nullptr) {
            std::cerr << "src2=" << src2 << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl;
        }
+       if (src3 != nullptr) {
+           std::cerr << "src3=" << src3 << " op=" << ggml_op_name(src3->op) << " type=" << ggml_type_name(src3->type) << " ne0=" << src3->ne[0] << " nb0=" << src3->nb[0] << " ne1=" << src3->ne[1] << " nb1=" << src3->nb[1] << " ne2=" << src3->ne[2] << " nb2=" << src3->nb[2] << " ne3=" << src3->ne[3] << " nb3=" << src3->nb[3] << " offset=" << src3->view_offs << std::endl;
+       }
        std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
        std::cerr << std::endl << "Result:" << std::endl;
        ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
@@ -8804,6 +8832,9 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
        if (src2 != nullptr) {
            std::cerr << "src2=" << src2 << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl;
        }
+       if (src3 != nullptr) {
+           std::cerr << "src3=" << src3 << " op=" << ggml_op_name(src3->op) << " type=" << ggml_type_name(src3->type) << " ne0=" << src3->ne[0] << " nb0=" << src3->nb[0] << " ne1=" << src3->ne[1] << " nb1=" << src3->nb[1] << " ne2=" << src3->ne[2] << " nb2=" << src3->nb[2] << " ne3=" << src3->ne[3] << " nb3=" << src3->nb[3] << " offset=" << src3->view_offs << std::endl;
+       }
        std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
        std::cerr << std::endl << "Result:" << std::endl;
        ggml_vk_print_tensor_area(tensor, tensor_data, first_error[0], first_error[1], first_error[2], first_error[3]);
ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
@@ -42,10 +42,13 @@ layout (push_constant) uniform parameter {
    uint32_t nev3;
    uint32_t nem1;

+   uint32_t nb01;
    uint32_t nb02;
    uint32_t nb03;
+   uint32_t nb11;
    uint32_t nb12;
    uint32_t nb13;
+   uint32_t nb21;
    uint32_t nb22;
    uint32_t nb23;
    uint32_t nb31;
@@ -146,6 +149,23 @@ void main() {
    tensorLayoutK = setTensorLayoutDimensionNV(tensorLayoutK, KV, D);
    tensorLayoutV = setTensorLayoutDimensionNV(tensorLayoutV, KV, D);

+   // nb?1 are already divided by the type size and are in units of elements
+   uint32_t q_stride = p.nb01;
+   uint32_t k_stride = p.nb11;
+   uint32_t v_stride = p.nb21;
+   // hint to the compiler that strides are aligned for the aligned variant of the shader
+   if (Clamp != gl_CooperativeMatrixClampModeConstantNV)
+   {
+       q_stride &= ~7;
+#if !defined(BLOCK_SIZE)
+       k_stride &= ~7;
+       v_stride &= ~7;
+#endif
+   }
+   tensorLayoutQ = setTensorLayoutStrideNV(tensorLayoutQ, q_stride, 1);
+   tensorLayoutK = setTensorLayoutStrideNV(tensorLayoutK, k_stride, 1);
+   tensorLayoutV = setTensorLayoutStrideNV(tensorLayoutV, v_stride, 1);

    coopmat<Q_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseA> Q;
    coopmat<float16_t, gl_ScopeWorkgroup, Br, D, gl_MatrixUseA> Qf16;
ggml/src/gguf.cpp
@@ -693,6 +693,10 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
    ok = ok && data != nullptr;

+   if (ok) {
+       ggml_set_name(data, "GGUF tensor data binary blob");
+   }
+
    // read the binary blob with the tensor data
    ok = ok && gr.read(data->data, ctx->size);
koboldcpp.py
@@ -59,7 +59,7 @@ maxhordelen = 400
modelbusy = threading.Lock()
requestsinqueue = 0
defaultport = 5001
-KcppVersion = "1.82.1"
+KcppVersion = "1.83"
showdebug = True
guimode = False
showsamplerwarning = True
112 scripts/hf.sh Executable file
@@ -0,0 +1,112 @@
#!/bin/bash
#
# Shortcut for downloading HF models
#
# Usage:
#   ./llama-cli -m $(./scripts/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
#   ./llama-cli -m $(./scripts/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
#   ./llama-cli -m $(./scripts/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf)
#

# all logs go to stderr
function log {
    echo "$@" 1>&2
}

function usage {
    log "Usage: $0 [[--url] <url>] [--repo <repo>] [--file <file>] [--outdir <dir>] [-h|--help]"
    exit 1
}

# check for curl or wget
function has_cmd {
    if ! [ -x "$(command -v $1)" ]; then
        return 1
    fi
}

if has_cmd wget; then
    cmd="wget -q -c -O %s/%s %s"
elif has_cmd curl; then
    cmd="curl -C - -f --output-dir %s -o %s -L %s"
else
    log "[E] curl or wget not found"
    exit 1
fi

url=""
repo=""
file=""
outdir="."

# parse args
while [[ $# -gt 0 ]]; do
    case "$1" in
        --url)
            url="$2"
            shift 2
            ;;
        --repo)
            repo="$2"
            shift 2
            ;;
        --file)
            file="$2"
            shift 2
            ;;
        --outdir)
            outdir="$2"
            shift 2
            ;;
        -h|--help)
            usage
            ;;
        *)
            url="$1"
            shift
            ;;
    esac
done

if [ -n "$repo" ] && [ -n "$file" ]; then
    url="https://huggingface.co/$repo/resolve/main/$file"
fi

if [ -z "$url" ]; then
    log "[E] missing --url"
    usage
fi

# check if the URL is a HuggingFace model, and if so, try to download it
is_url=false

if [[ ${#url} -gt 22 ]]; then
    if [[ ${url:0:22} == "https://huggingface.co" ]]; then
        is_url=true
    fi
fi

if [ "$is_url" = false ]; then
    log "[E] invalid URL, must start with https://huggingface.co"
    exit 0
fi

# replace "blob/main" with "resolve/main"
url=${url/blob\/main/resolve\/main}

basename=$(basename $url)

log "[+] attempting to download $basename"

if [ -n "$cmd" ]; then
    cmd=$(printf "$cmd" "$outdir" "$basename" "$url")
    log "[+] $cmd"
    if $cmd; then
        echo $outdir/$basename
        exit 0
    fi
fi

log "[-] failed to download"

exit 1
src/llama-chat.cpp
@@ -152,7 +152,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
        return LLM_CHAT_TEMPLATE_MINICPM;
    } else if (tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
        return LLM_CHAT_TEMPLATE_DEEPSEEK_2;
-   } else if (tmpl_contains(LU8("'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'"))) {
+   } else if (tmpl_contains(LU8("<|Assistant|>")) && tmpl_contains(LU8("<|User|>")) && tmpl_contains(LU8("<|end▁of▁sentence|>"))) {
        return LLM_CHAT_TEMPLATE_DEEPSEEK_3;
    } else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
        // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
@@ -7,6 +7,7 @@
#include <cstring>
#include <climits>
#include <stdexcept>
#include <cerrno>

#ifdef __has_include
#if __has_include(<unistd.h>)
src/llama-model.cpp
@@ -2294,6 +2294,50 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), { n_embd, 2 * n_ff }, 0);

                        layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                    }
                } break;
            case LLM_ARCH_PHIMOE:
                {
                    const int64_t n_embd_head = n_embd / n_head;

                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);

                    // output
                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   { n_embd }, 0);
                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), { n_embd, n_vocab }, 0);
                    output_b      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "bias"),   { n_vocab }, 0);

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];

                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias",   i), { n_embd }, 0);

                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
                        if (layer.wqkv == nullptr) {
                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias",   i), {n_embd}, 0);

                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias",   i), {n_embd_gqa}, 0);

                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias",   i), {n_embd_gqa}, 0);
                        }
                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias",   i), { n_embd }, 0);

                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias",   i), { n_embd }, 0);

                        layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff, n_expert}, 0);

                        layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                        layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                    }
src/llama-vocab.cpp
@@ -1758,7 +1758,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                pre_type = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
                clean_spaces = false;
            } else if (
-               tokenizer_pre == "qwen2") {
+               tokenizer_pre == "qwen2" ||
+               tokenizer_pre == "deepseek-r1-qwen") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
                clean_spaces = false;
            } else if (
src/unicode.cpp
@@ -7,18 +7,17 @@

#include <algorithm>
#include <cassert>
+#include <codecvt>
#include <cstddef>
#include <cstdint>
+#include <locale>
#include <map>
#include <regex>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
-#include <locale>
-#include <codecvt>

size_t unicode_len_utf8(char src) {
    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
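For context on the lookup table above: unicode_len_utf8 indexes it with the top four bits of the lead byte, so indices 0x0-0x7 (ASCII) map to 1, 0x8-0xB (continuation bytes, treated as length 1) map to 1, 0xC-0xD (two-byte leads) to 2, 0xE to 3, and 0xF to 4. A standalone re-derivation:

#include <cassert>
#include <cstddef>
#include <cstdint>

static size_t utf8_len(char src) {
    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
    const uint8_t high_bits = static_cast<uint8_t>(src) >> 4;  // top nibble of the lead byte
    return lookup[high_bits];
}

int main() {
    assert(utf8_len('A')    == 1);  // 0x41 -> high nibble 0x4
    assert(utf8_len('\xC3') == 2);  // lead byte of U+00E9 (two-byte sequence)
    assert(utf8_len('\xE2') == 3);  // lead byte of U+20AC (three-byte sequence)
    assert(utf8_len('\xF0') == 4);  // lead byte of U+1F600 (four-byte sequence)
    return 0;
}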
tests/test-gguf.cpp
@@ -48,7 +48,7 @@ enum handcrafted_file_type {
    HANDCRAFTED_DATA_CUSTOM_ALIGN = 810 + offset_has_data,
};

-std::string handcrafted_file_type_name(const enum handcrafted_file_type hft) {
+static std::string handcrafted_file_type_name(const enum handcrafted_file_type hft) {
    switch (hft) {
        case HANDCRAFTED_HEADER_BAD_MAGIC:     return "HEADER_BAD_MAGIC";
        case HANDCRAFTED_HEADER_BAD_VERSION_1: return "HEADER_BAD_VERSION_1";
@@ -99,7 +99,7 @@ static bool expect_context_not_null(const enum handcrafted_file_type hft) {

typedef std::pair<enum ggml_type, std::array<int64_t, GGML_MAX_DIMS>> tensor_config_t;

-std::vector<tensor_config_t> get_tensor_configs(std::mt19937 & rng) {
+static std::vector<tensor_config_t> get_tensor_configs(std::mt19937 & rng) {
    std::vector<tensor_config_t> tensor_configs;
    tensor_configs.reserve(100);

@@ -122,7 +122,7 @@ std::vector<tensor_config_t> get_tensor_configs(std::mt19937 & rng) {
    return tensor_configs;
}

-std::vector<std::pair<enum gguf_type, enum gguf_type>> get_kv_types(std::mt19937 rng) {
+static std::vector<std::pair<enum gguf_type, enum gguf_type>> get_kv_types(std::mt19937 rng) {
    std::vector<std::pair<enum gguf_type, enum gguf_type>> kv_types;
    kv_types.reserve(100);

@@ -626,8 +626,6 @@ static bool handcrafted_check_tensor_data(const gguf_context * gguf_ctx, const u

    bool ok = true;

-   const uint32_t alignment = GGUF_DEFAULT_ALIGNMENT;
-
    for (int i = 0; i < int(tensor_configs.size()); ++i) {
        const ggml_type type = tensor_configs[i].first;
        const std::array<int64_t, GGML_MAX_DIMS> shape = tensor_configs[i].second;
@@ -866,13 +864,13 @@ static struct random_gguf_context_result get_random_gguf_context(ggml_backend_t
-                   case GGUF_TYPE_COUNT:
-                   default: {
-                       GGML_ABORT("fatal error");
-                   } break;
-               }
-           }
+               } break;
+               case GGUF_TYPE_COUNT:
+               default: {
+                   GGML_ABORT("fatal error");
+               } break;
+           }
+       }
    }

@@ -938,7 +936,7 @@ static bool all_kv_in_other(const gguf_context * ctx, const gguf_context * other
    }

    if (type == GGUF_TYPE_ARRAY) {
-       const int arr_n = gguf_get_arr_n(ctx, id);
+       const size_t arr_n = gguf_get_arr_n(ctx, id);
        if (arr_n != gguf_get_arr_n(other, idx_other)) {
            ok = false;
            continue;
@@ -953,7 +951,7 @@ static bool all_kv_in_other(const gguf_context * ctx, const gguf_context * other
        if (type_arr == GGUF_TYPE_BOOL) {
            const int8_t * data       = reinterpret_cast<const int8_t *>(gguf_get_arr_data(ctx, id));
            const int8_t * data_other = reinterpret_cast<const int8_t *>(gguf_get_arr_data(other, idx_other));
-           for (int arr_i = 0; arr_i < arr_n; ++arr_i) {
+           for (size_t arr_i = 0; arr_i < arr_n; ++arr_i) {
                if (bool(data[arr_i]) != bool(data_other[arr_i])) {
                    ok = false;
                }
@@ -962,7 +960,7 @@ static bool all_kv_in_other(const gguf_context * ctx, const gguf_context * other
        }

        if (type_arr == GGUF_TYPE_STRING) {
-           for (int arr_i = 0; arr_i < arr_n; ++arr_i) {
+           for (size_t arr_i = 0; arr_i < arr_n; ++arr_i) {
                const std::string str       = gguf_get_arr_str(ctx, id, arr_i);
                const std::string str_other = gguf_get_arr_str(other, idx_other, arr_i);
                if (str != str_other) {
@@ -1033,6 +1031,12 @@ static bool same_tensor_data(const struct ggml_context * orig, const struct ggml

    struct ggml_tensor * t_orig = ggml_get_first_tensor(orig);
    struct ggml_tensor * t_read = ggml_get_first_tensor(read);

+   if (std::string(t_read->name) != "GGUF tensor data binary blob") {
+       return false;
+   }
+   t_read = ggml_get_next_tensor(read, t_read);
+
    while (t_orig) {
        if (!t_read) {
            ok = false;
@@ -1051,13 +1055,13 @@ static bool same_tensor_data(const struct ggml_context * orig, const struct ggml
        }

        t_orig = ggml_get_next_tensor(orig, t_orig);
-       t_read = ggml_get_next_tensor(orig, t_read);
+       t_read = ggml_get_next_tensor(read, t_read);
    }
    if (t_read) {
        ok = false;
    }

-   return true;
+   return ok;
}

static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned int seed, const bool only_meta) {