Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	.github/workflows/docker.yml
#	Makefile
#	README-sycl.md
#	README.md
#	ci/run.sh
#	ggml-cuda.cu
#	ggml.c
#	grammars/README.md
#	scripts/get-wikitext-2.sh
#	scripts/hf.sh
#	scripts/sync-ggml.last
#	tests/test-backend-ops.cpp
#	tests/test-grammar-integration.cpp
#	tests/test-json-schema-to-grammar.cpp
This commit is contained in:
Concedo 2024-04-14 21:18:39 +08:00
commit 9a25d77cc1
58 changed files with 6529 additions and 6121 deletions

View file

@ -108,9 +108,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
// the top-k selected expert ids are stored in the ids tensor
// for simplicity, always copy ids to host, because it is small
// take into account that ids is not contiguous!
GGML_ASSERT(ids->ne[1] == src1->ne[1]);
GGML_ASSERT(n_as*ggml_nrows(ids)*sizeof(int) == GGML_PAD(ggml_nbytes(ids), n_as*sizeof(int)));
m_ids.resize(ggml_nbytes(ids)/sizeof(int));
ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));
@ -350,12 +348,13 @@ static void process_logits(
static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
const int n_ctx = llama_n_ctx(ctx);
auto tim1 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
auto tim2 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
@ -597,24 +596,18 @@ int main(int argc, char ** argv) {
llama_backend_init();
llama_numa_init(params.numa);
llama_model_params mparams = llama_model_params_from_gpt_params(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1;
}
llama_context_params cparams = llama_context_params_from_gpt_params(params);
// pass the callback to the backend scheduler
// it will be executed for each node during the graph computation
cparams.cb_eval = ik_collect_imatrix;
cparams.cb_eval_user_data = NULL;
params.cb_eval = ik_collect_imatrix;
params.cb_eval_user_data = NULL;
params.warmup = false;
llama_context * ctx = llama_new_context_with_model(model, cparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: unable to create context\n", __func__);
// init
llama_model * model;
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__);
return 1;
}