Merge branch 'master' into concedo_experimental

# Conflicts:
#	CMakeLists.txt
#	Makefile
#	README.md
This commit is contained in:
Concedo 2023-06-19 22:28:38 +08:00
commit cf94340dfc
4 changed files with 148 additions and 77 deletions

View file

@ -1620,7 +1620,7 @@ static bool llama_eval_internal(
model.layers[il].w1,
cur);
offload_func(cur);
ggml_set_name(cur, "result_w2");
ggml_set_name(cur, "result_w1");
// SILU activation
cur = ggml_silu(ctx0, cur);
@ -1657,11 +1657,7 @@ static bool llama_eval_internal(
{
cur = ggml_rms_norm(ctx0, inpL);
offload_func_nr(cur);
ggml_set_name(cur, "rms_norm_inpL");
cur = ggml_rms_norm(ctx0, cur);
offload_func_nr(cur);
ggml_set_name(cur, "rms_norm_after");
ggml_set_name(cur, "rms_norm_2");
// cur = cur*norm(broadcasted)
cur = ggml_mul(ctx0, cur, model.norm);
@ -3471,9 +3467,12 @@ void llama_print_timings(struct llama_context * ctx) {
fprintf(stderr, "\n");
fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample, 1e6 / ctx->t_sample_us * n_sample);
fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval, 1e6 / ctx->t_p_eval_us * n_p_eval);
fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval, 1e6 / ctx->t_eval_us * n_eval);
fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
}