mirror of https://github.com/Lizonghang/prima.cpp.git
synced 2025-09-04 00:09:08 +00:00
Add an independent profile tool
parent 168c14f4e8
commit cf4fa04980

3 changed files with 69 additions and 1 deletion
.gitignore (vendored): 1 change

@@ -67,6 +67,7 @@ autogen-*.md
 /main
 /server
+/profile-tool
 
 # CI
Makefile: 7 changes

@@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = llama-cli
+BUILD_TARGETS = llama-cli profile-tool
 # BUILD_TARGETS = \
 #   libllava.a \
 #   llama-baby-llama \

@@ -1528,6 +1528,11 @@ llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
     $(OBJ_ALL)
     $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
+
+profile-tool: tools/profile_tool.cpp \
+    $(OBJ_ALL)
+    $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+    $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift
     (cd examples/batched.swift; make build)
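The new target follows the same two-step pattern as the other single-source targets in this Makefile: the source file is first compiled to an object via GET_OBJ_FILE and then linked against the common objects in OBJ_ALL, producing a profile-tool binary in the repository root (which is why /profile-tool is added to .gitignore above). Assuming a default build setup, it can be built on its own with:

    make profile-tool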
tools/profile_tool.cpp: 62 changes (new file)

@@ -0,0 +1,62 @@
+#include "arg.h"
+#include "common.h"
+#include "console.h"
+#include "log.h"
+#include "llama.h"
+
+static void print_usage(int argc, char ** argv) {
+    (void) argc;
+
+    LOG("\nexample usage:\n");
+    LOG("\n  text generation:     %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
+    LOG("\n  chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
+    LOG("\n");
+}
+
+int main(int argc, char ** argv) {
+    gpt_params params;
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
+        return 1;
+    }
+
+    if (params.n_ctx != 0 && params.n_ctx < 8) {
+        LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+        params.n_ctx = 8;
+    }
+
+    if (params.rope_freq_base != 0.0) {
+        LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
+    }
+
+    if (params.rope_freq_scale != 0.0) {
+        LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
+    }
+
+    // load the model and apply lora adapter, if any
+    auto mparams = llama_model_params_from_gpt_params(params);
+    struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
+
+    struct llama_model * model = nullptr;
+
+    if (!params.hf_repo.empty() && !params.hf_file.empty()) {
+        model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+    } else if (!params.model_url.empty()) {
+        model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+    } else {
+        model = llama_load_model_from_file(params.model.c_str(), mparams);
+    }
+
+    if (model == NULL) {
+        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
+        return -1;
+    }
+
+    llama_model_loader * ml = llama_model_load(params.model.c_str(), model, &mparams);
+
+    device_info dev_info;
+    llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
+    device_print_props(&dev_info, 1, model, cparams);
+
+    llama_free_model(model);
+    return 0;
+}
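The tool itself is essentially a stripped-down variant of the main CLI entry point: it parses the usual gpt_params (reusing LLAMA_EXAMPLE_MAIN and its example usage strings), loads the model from a Hugging Face repo, a URL, or a local file, then calls llama_profile_device and device_print_props to collect and print the device profile before freeing the model and exiting, without running any generation. A minimal usage sketch (the model path is a placeholder; any option accepted by gpt_params_parse should behave as it does for llama-cli):

    ./profile-tool -m your_model.gguf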