Merge branch 'master' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	CMakeLists.txt
#	Makefile
#	README.md
#	pocs/vdot/q8dot.cpp
#	pocs/vdot/vdot.cpp
#	scripts/sync-ggml.sh
#	tests/test-grad0.c
#	tests/test-quantize-fns.cpp
#	tests/test-quantize-perf.cpp
This commit is contained in:
Concedo 2023-07-06 15:40:40 +08:00
commit 220aa707e6
18 changed files with 5694 additions and 1985 deletions

View file

@ -1158,6 +1158,7 @@ static void llama_model_load_internal(
}
}
#endif // GGML_USE_CUBLAS
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@ -1166,6 +1167,10 @@ static void llama_model_load_internal(
fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
}
size_t vram_kv_cache = 0;
#ifdef GGML_USE_CUBLAS
const int max_backend_supported_layers = hparams.n_layer + 3;
const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
if (n_gpu_layers > (int) hparams.n_layer + 1) {
if (low_vram) {
fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
@ -1182,14 +1187,18 @@ static void llama_model_load_internal(
vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
}
}
const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
#elif defined(GGML_USE_CLBLAST)
const int max_backend_supported_layers = hparams.n_layer + 1;
const int max_offloadable_layers = hparams.n_layer + 1;
#endif // GGML_USE_CUBLAS
fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
__func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
__func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
fprintf(stderr, "%s: total VRAM used: %zu MB\n",
__func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
#else
(void) n_gpu_layers;
#endif
#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
}
// populate `tensors_by_name`
@ -1896,10 +1905,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
return;
}
const int64_t t_start_sample_us = ggml_time_us();
llama_sample_softmax(ctx, candidates);
const int64_t t_start_sample_us = ggml_time_us();
// Compute the cumulative probabilities
float cum_sum = 0.0f;
size_t last_idx = candidates->size;
@ -1928,9 +1937,8 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
return;
}
const int64_t t_start_sample_us = ggml_time_us();
llama_sample_softmax(nullptr, candidates);
const int64_t t_start_sample_us = ggml_time_us();
// Compute the first and second derivatives
std::vector<float> first_derivatives(candidates->size - 1);
@ -1982,11 +1990,11 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
return;
}
const int64_t t_start_sample_us = ggml_time_us();
// Compute the softmax of logits and calculate entropy
llama_sample_softmax(nullptr, candidates);
const int64_t t_start_sample_us = ggml_time_us();
float entropy = 0.0f;
for (size_t i = 0; i < candidates->size; ++i) {
entropy += -candidates->data[i].p * logf(candidates->data[i].p);
@ -2155,13 +2163,11 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
if (ctx) {
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
ctx->n_sample++;
}
return X;
}
llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
assert(ctx);
int64_t t_start_sample_us;
t_start_sample_us = ggml_time_us();
@ -2176,13 +2182,14 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
candidates->size = 1;
}
if (ctx) {
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
}
// Normalize the probabilities of the remaining words
llama_sample_softmax(ctx, candidates);
// Sample the next word X from the remaining words
if (ctx) {
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
}
llama_token X = llama_sample_token(ctx, candidates);
t_start_sample_us = ggml_time_us();
@ -2250,10 +2257,10 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
}
float * f32_output = (float *) output.addr;
quantize_fns_t qtype;
ggml_type_traits_t qtype;
if (ggml_is_quantized(tensor.type)) {
qtype = ggml_internal_get_quantize_fn(tensor.type);
if (qtype.dequantize_row_q == NULL) {
qtype = ggml_internal_get_type_traits(tensor.type);
if (qtype.to_float == NULL) {
throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
}
} else if (tensor.type != GGML_TYPE_F16) {
@ -2264,7 +2271,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
if (tensor.type == GGML_TYPE_F16) {
ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
} else if (ggml_is_quantized(tensor.type)) {
qtype.dequantize_row_q(tensor.data, f32_output, nelements);
qtype.to_float(tensor.data, f32_output, nelements);
} else {
LLAMA_ASSERT(false); // unreachable
}
@ -2289,7 +2296,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
if (typ == GGML_TYPE_F16) {
ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
} else {
qtype.dequantize_row_q(inbuf, outbuf, nels);
qtype.to_float(inbuf, outbuf, nels);
}
};
workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
@ -3474,23 +3481,35 @@ llama_token llama_token_nl() {
return 13;
}
struct llama_timings llama_get_timings(struct llama_context * ctx) {
struct llama_timings result = {
/*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
/*.t_end_ms =*/ 1.00 * ggml_time_ms(),
/*.t_load_ms =*/ 1e-3 * ctx->t_load_us,
/*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us,
/*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
/*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
/*.n_sample =*/ std::max(1, ctx->n_sample),
/*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
/*.n_eval =*/ std::max(1, ctx->n_eval),
};
return result;
}
void llama_print_timings(struct llama_context * ctx) {
const int64_t t_end_us = ggml_time_us();
const int32_t n_sample = std::max(1, ctx->n_sample);
const int32_t n_eval = std::max(1, ctx->n_eval);
const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
const llama_timings timings = llama_get_timings(ctx);
fprintf(stderr, "\n");
fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, timings.t_load_ms);
fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample, 1e6 / ctx->t_sample_us * n_sample);
__func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval, 1e6 / ctx->t_p_eval_us * n_p_eval);
__func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval, 1e6 / ctx->t_eval_us * n_eval);
fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
__func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
}
void llama_reset_timings(struct llama_context * ctx) {