diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml new file mode 100644 index 000000000..bc08a72d0 --- /dev/null +++ b/.github/workflows/close-issue.yml @@ -0,0 +1,22 @@ +name: Close inactive issues +on: + schedule: + - cron: "42 0 * * *" + +jobs: + close-issues: + runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write + steps: + - uses: actions/stale@v5 + with: + days-before-issue-stale: 30 + days-before-issue-close: 14 + stale-issue-label: "stale" + stale-issue-message: "This issue is stale because it has been open for 30 days with no activity." + close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale." + days-before-pr-stale: -1 + days-before-pr-close: -1 + repo-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/common/common.cpp b/common/common.cpp index edd1efd67..fd09a76e1 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -152,13 +152,17 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { std::replace(arg.begin(), arg.end(), '_', '-'); } + bool arg_found = false; if (arg == "-s" || arg == "--seed") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.seed = std::stoul(argv[i]); - } else if (arg == "-t" || arg == "--threads") { + } + if (arg == "-t" || arg == "--threads") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -167,7 +171,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (params.n_threads <= 0) { params.n_threads = std::thread::hardware_concurrency(); } - } else if (arg == "-tb" || arg == "--threads-batch") { + } + if (arg == "-tb" || arg == "--threads-batch") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -176,7 +182,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (params.n_threads_batch <= 0) { params.n_threads_batch = std::thread::hardware_concurrency(); } - } else if (arg 
== "-td" || arg == "--threads-draft") { + } + if (arg == "-td" || arg == "--threads-draft") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -185,7 +193,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (params.n_threads_draft <= 0) { params.n_threads_draft = std::thread::hardware_concurrency(); } - } else if (arg == "-tbd" || arg == "--threads-batch-draft") { + } + if (arg == "-tbd" || arg == "--threads-batch-draft") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -194,25 +204,37 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (params.n_threads_batch_draft <= 0) { params.n_threads_batch_draft = std::thread::hardware_concurrency(); } - } else if (arg == "-p" || arg == "--prompt") { + } + if (arg == "-p" || arg == "--prompt") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.prompt = argv[i]; - } else if (arg == "-e" || arg == "--escape") { + } + if (arg == "-e" || arg == "--escape") { + arg_found = true; params.escape = true; - } else if (arg == "--prompt-cache") { + } + if (arg == "--prompt-cache") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.path_prompt_cache = argv[i]; - } else if (arg == "--prompt-cache-all") { + } + if (arg == "--prompt-cache-all") { + arg_found = true; params.prompt_cache_all = true; - } else if (arg == "--prompt-cache-ro") { + } + if (arg == "--prompt-cache-ro") { + arg_found = true; params.prompt_cache_ro = true; - } else if (arg == "-bf" || arg == "--binary-file") { + } + if (arg == "-bf" || arg == "--binary-file") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -229,7 +251,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { ss << file.rdbuf(); params.prompt = ss.str(); fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]); - } else if (arg == "-f" || arg == "--file") { + } + if (arg == 
"-f" || arg == "--file") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -246,51 +270,67 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (!params.prompt.empty() && params.prompt.back() == '\n') { params.prompt.pop_back(); } - } else if (arg == "-n" || arg == "--n-predict") { + } + if (arg == "-n" || arg == "--n-predict") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_predict = std::stoi(argv[i]); - } else if (arg == "--top-k") { + } + if (arg == "--top-k") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.top_k = std::stoi(argv[i]); - } else if (arg == "-c" || arg == "--ctx-size") { + } + if (arg == "-c" || arg == "--ctx-size") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_ctx = std::stoi(argv[i]); - } else if (arg == "--grp-attn-n" || arg == "-gan") { + } + if (arg == "--grp-attn-n" || arg == "-gan") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.grp_attn_n = std::stoi(argv[i]); - } else if (arg == "--grp-attn-w" || arg == "-gaw") { + } + if (arg == "--grp-attn-w" || arg == "-gaw") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.grp_attn_w = std::stoi(argv[i]); - } else if (arg == "--rope-freq-base") { + } + if (arg == "--rope-freq-base") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.rope_freq_base = std::stof(argv[i]); - } else if (arg == "--rope-freq-scale") { + } + if (arg == "--rope-freq-scale") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.rope_freq_scale = std::stof(argv[i]); - } else if (arg == "--rope-scaling") { + } + if (arg == "--rope-scaling") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -300,43 +340,57 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { else if (value == "linear") { params.rope_scaling_type = 
LLAMA_ROPE_SCALING_TYPE_LINEAR; } else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } else { invalid_param = true; break; } - } else if (arg == "--rope-scale") { + } + if (arg == "--rope-scale") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.rope_freq_scale = 1.0f/std::stof(argv[i]); - } else if (arg == "--yarn-orig-ctx") { + } + if (arg == "--yarn-orig-ctx") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.yarn_orig_ctx = std::stoi(argv[i]); - } else if (arg == "--yarn-ext-factor") { + } + if (arg == "--yarn-ext-factor") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.yarn_ext_factor = std::stof(argv[i]); - } else if (arg == "--yarn-attn-factor") { + } + if (arg == "--yarn-attn-factor") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.yarn_attn_factor = std::stof(argv[i]); - } else if (arg == "--yarn-beta-fast") { + } + if (arg == "--yarn-beta-fast") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.yarn_beta_fast = std::stof(argv[i]); - } else if (arg == "--yarn-beta-slow") { + } + if (arg == "--yarn-beta-slow") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.yarn_beta_slow = std::stof(argv[i]); - } else if (arg == "--pooling") { + } + if (arg == "--pooling") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -346,118 +400,156 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } else { invalid_param = true; break; } - } else if (arg == "--defrag-thold" || arg == "-dt") { + } + if (arg == "--defrag-thold" || arg == "-dt") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.defrag_thold = std::stof(argv[i]); - } else if (arg == 
"--samplers") { + } + if (arg == "--samplers") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } const auto sampler_names = string_split(argv[i], ';'); sparams.samplers_sequence = sampler_types_from_names(sampler_names, true); - } else if (arg == "--sampling-seq") { + } + if (arg == "--sampling-seq") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.samplers_sequence = sampler_types_from_chars(argv[i]); - } else if (arg == "--top-p") { + } + if (arg == "--top-p") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.top_p = std::stof(argv[i]); - } else if (arg == "--min-p") { + } + if (arg == "--min-p") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.min_p = std::stof(argv[i]); - } else if (arg == "--temp") { + } + if (arg == "--temp") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.temp = std::stof(argv[i]); sparams.temp = std::max(sparams.temp, 0.0f); - } else if (arg == "--tfs") { + } + if (arg == "--tfs") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.tfs_z = std::stof(argv[i]); - } else if (arg == "--typical") { + } + if (arg == "--typical") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.typical_p = std::stof(argv[i]); - } else if (arg == "--repeat-last-n") { + } + if (arg == "--repeat-last-n") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.penalty_last_n = std::stoi(argv[i]); sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n); - } else if (arg == "--repeat-penalty") { + } + if (arg == "--repeat-penalty") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.penalty_repeat = std::stof(argv[i]); - } else if (arg == "--frequency-penalty") { + } + if (arg == "--frequency-penalty") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.penalty_freq = 
std::stof(argv[i]); - } else if (arg == "--presence-penalty") { + } + if (arg == "--presence-penalty") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.penalty_present = std::stof(argv[i]); - } else if (arg == "--dynatemp-range") { + } + if (arg == "--dynatemp-range") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.dynatemp_range = std::stof(argv[i]); - } else if (arg == "--dynatemp-exp") { + } + if (arg == "--dynatemp-exp") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.dynatemp_exponent = std::stof(argv[i]); - } else if (arg == "--mirostat") { + } + if (arg == "--mirostat") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.mirostat = std::stoi(argv[i]); - } else if (arg == "--mirostat-lr") { + } + if (arg == "--mirostat-lr") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.mirostat_eta = std::stof(argv[i]); - } else if (arg == "--mirostat-ent") { + } + if (arg == "--mirostat-ent") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.mirostat_tau = std::stof(argv[i]); - } else if (arg == "--cfg-negative-prompt") { + } + if (arg == "--cfg-negative-prompt") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.cfg_negative_prompt = argv[i]; - } else if (arg == "--cfg-negative-prompt-file") { + } + if (arg == "--cfg-negative-prompt-file") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -472,86 +564,114 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') { sparams.cfg_negative_prompt.pop_back(); } - } else if (arg == "--cfg-scale") { + } + if (arg == "--cfg-scale") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.cfg_scale = std::stof(argv[i]); - } else if (arg == "-b" || arg == "--batch-size") { + 
} + if (arg == "-b" || arg == "--batch-size") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_batch = std::stoi(argv[i]); - } else if (arg == "-ub" || arg == "--ubatch-size") { + } + if (arg == "-ub" || arg == "--ubatch-size") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_ubatch = std::stoi(argv[i]); - } else if (arg == "--keep") { + } + if (arg == "--keep") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_keep = std::stoi(argv[i]); - } else if (arg == "--draft") { + } + if (arg == "--draft") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_draft = std::stoi(argv[i]); - } else if (arg == "--chunks") { + } + if (arg == "--chunks") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_chunks = std::stoi(argv[i]); - } else if (arg == "-np" || arg == "--parallel") { + } + if (arg == "-np" || arg == "--parallel") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_parallel = std::stoi(argv[i]); - } else if (arg == "-ns" || arg == "--sequences") { + } + if (arg == "-ns" || arg == "--sequences") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_sequences = std::stoi(argv[i]); - } else if (arg == "--p-split" || arg == "-ps") { + } + if (arg == "--p-split" || arg == "-ps") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.p_split = std::stof(argv[i]); - } else if (arg == "-m" || arg == "--model") { + } + if (arg == "-m" || arg == "--model") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.model = argv[i]; - } else if (arg == "-md" || arg == "--model-draft") { + } + if (arg == "-md" || arg == "--model-draft") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.model_draft = argv[i]; - } else if (arg == "-a" || arg == "--alias") { + } + if (arg == "-a" || arg == "--alias") { + 
arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.model_alias = argv[i]; - } else if (arg == "--lora") { + } + if (arg == "--lora") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.lora_adapter.emplace_back(argv[i], 1.0f); params.use_mmap = false; - } else if (arg == "--lora-scaled") { + } + if (arg == "--lora-scaled") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -563,55 +683,127 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); params.use_mmap = false; - } else if (arg == "--lora-base") { + } + if (arg == "--lora-base") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.lora_base = argv[i]; - } else if (arg == "--mmproj") { + } + if (arg == "--control-vector") { + arg_found = true; + if (++i >= argc) { + invalid_param = true; + break; + } + params.control_vectors.push_back({ 1.0f, argv[i], }); + } + if (arg == "--control-vector-scaled") { + arg_found = true; + if (++i >= argc) { + invalid_param = true; + break; + } + const char * fname = argv[i]; + if (++i >= argc) { + invalid_param = true; + break; + } + params.control_vectors.push_back({ std::stof(argv[i]), fname, }); + } + if (arg == "--control-vector-layer-range") { + arg_found = true; + if (++i >= argc) { + invalid_param = true; + break; + } + params.control_vector_layer_start = std::stoi(argv[i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.control_vector_layer_end = std::stoi(argv[i]); + } + if (arg == "--mmproj") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.mmproj = argv[i]; - } else if (arg == "--image") { + } + if (arg == "--image") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.image = argv[i]; - } else if (arg == "-i" || arg == "--interactive") { + } + if (arg == "-i" || arg == "--interactive") { + arg_found = 
true; params.interactive = true; - } else if (arg == "--embedding") { + } + if (arg == "--embedding") { + arg_found = true; params.embedding = true; - } else if (arg == "--interactive-first") { + } + if (arg == "--interactive-first") { + arg_found = true; params.interactive_first = true; - } else if (arg == "-ins" || arg == "--instruct") { + } + if (arg == "-ins" || arg == "--instruct") { + arg_found = true; params.instruct = true; - } else if (arg == "-cml" || arg == "--chatml") { + } + if (arg == "-cml" || arg == "--chatml") { + arg_found = true; params.chatml = true; - } else if (arg == "--infill") { + } + if (arg == "--infill") { + arg_found = true; params.infill = true; - } else if (arg == "-dkvc" || arg == "--dump-kv-cache") { + } + if (arg == "-dkvc" || arg == "--dump-kv-cache") { + arg_found = true; params.dump_kv_cache = true; - } else if (arg == "-nkvo" || arg == "--no-kv-offload") { + } + if (arg == "-nkvo" || arg == "--no-kv-offload") { + arg_found = true; params.no_kv_offload = true; - } else if (arg == "-ctk" || arg == "--cache-type-k") { + } + if (arg == "-ctk" || arg == "--cache-type-k") { + arg_found = true; params.cache_type_k = argv[++i]; - } else if (arg == "-ctv" || arg == "--cache-type-v") { + } + if (arg == "-ctv" || arg == "--cache-type-v") { + arg_found = true; params.cache_type_v = argv[++i]; - } else if (arg == "--multiline-input") { + } + if (arg == "--multiline-input") { + arg_found = true; params.multiline_input = true; - } else if (arg == "--simple-io") { + } + if (arg == "--simple-io") { + arg_found = true; params.simple_io = true; - } else if (arg == "-cb" || arg == "--cont-batching") { + } + if (arg == "-cb" || arg == "--cont-batching") { + arg_found = true; params.cont_batching = true; - } else if (arg == "--color") { + } + if (arg == "--color") { + arg_found = true; params.use_color = true; - } else if (arg == "--mlock") { + } + if (arg == "--mlock") { + arg_found = true; params.use_mlock = true; - } else if (arg == 
"--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { + } + if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -621,7 +813,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); } - } else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") { + } + if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -631,7 +825,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); } - } else if (arg == "--main-gpu" || arg == "-mg") { + } + if (arg == "--main-gpu" || arg == "-mg") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -640,7 +836,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { #ifndef GGML_USE_CUBLAS_SYCL fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n"); #endif // GGML_USE_CUBLAS_SYCL - } else if (arg == "--split-mode" || arg == "-sm") { + } + if (arg == "--split-mode" || arg == "-sm") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -664,7 +862,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. 
Setting the split mode has no effect.\n"); #endif // GGML_USE_CUBLAS_SYCL - } else if (arg == "--tensor-split" || arg == "-ts") { + } + if (arg == "--tensor-split" || arg == "-ts") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -689,9 +889,13 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { #ifndef GGML_USE_CUBLAS_SYCL_VULKAN fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. Setting a tensor split has no effect.\n"); #endif // GGML_USE_CUBLAS_SYCL - } else if (arg == "--no-mmap") { + } + if (arg == "--no-mmap") { + arg_found = true; params.use_mmap = false; - } else if (arg == "--numa") { + } + if (arg == "--numa") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -701,17 +905,25 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } else { invalid_param = true; break; } - } else if (arg == "--verbose-prompt") { + } + if (arg == "--verbose-prompt") { + arg_found = true; params.verbose_prompt = true; - } else if (arg == "--no-display-prompt") { + } + if (arg == "--no-display-prompt") { + arg_found = true; params.display_prompt = false; - } else if (arg == "-r" || arg == "--reverse-prompt") { + } + if (arg == "-r" || arg == "--reverse-prompt") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.antiprompt.emplace_back(argv[i]); - } else if (arg == "-ld" || arg == "--logdir") { + } + if (arg == "-ld" || arg == "--logdir") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -721,63 +933,93 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (params.logdir.back() != DIRECTORY_SEPARATOR) { params.logdir += DIRECTORY_SEPARATOR; } - } else if (arg == "--save-all-logits" || arg == "--kl-divergence-base") { + } + if (arg == 
"--save-all-logits" || arg == "--kl-divergence-base") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.logits_file = argv[i]; - } else if (arg == "--perplexity" || arg == "--all-logits") { + } + if (arg == "--perplexity" || arg == "--all-logits") { + arg_found = true; params.logits_all = true; - } else if (arg == "--ppl-stride") { + } + if (arg == "--ppl-stride") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.ppl_stride = std::stoi(argv[i]); - } else if (arg == "-ptc" || arg == "--print-token-count") { + } + if (arg == "-ptc" || arg == "--print-token-count") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.n_print = std::stoi(argv[i]); - } else if (arg == "--ppl-output-type") { + } + if (arg == "--ppl-output-type") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.ppl_output_type = std::stoi(argv[i]); - } else if (arg == "--hellaswag") { + } + if (arg == "--hellaswag") { + arg_found = true; params.hellaswag = true; - } else if (arg == "--hellaswag-tasks") { + } + if (arg == "--hellaswag-tasks") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.hellaswag_tasks = std::stoi(argv[i]); - } else if (arg == "--winogrande") { + } + if (arg == "--winogrande") { + arg_found = true; params.winogrande = true; - } else if (arg == "--winogrande-tasks") { + } + if (arg == "--winogrande-tasks") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.winogrande_tasks = std::stoi(argv[i]); - } else if (arg == "--multiple-choice") { + } + if (arg == "--multiple-choice") { + arg_found = true; params.multiple_choice = true; - } else if (arg == "--multiple-choice-tasks") { + } + if (arg == "--multiple-choice-tasks") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.multiple_choice_tasks = std::stoi(argv[i]); - } else if (arg == "--kl-divergence") { + } + if (arg == "--kl-divergence") { + 
arg_found = true; params.kl_divergence = true; - } else if (arg == "--ignore-eos") { + } + if (arg == "--ignore-eos") { + arg_found = true; params.ignore_eos = true; - } else if (arg == "--no-penalize-nl") { + } + if (arg == "--no-penalize-nl") { + arg_found = true; sparams.penalize_nl = false; - } else if (arg == "-l" || arg == "--logit-bias") { + } + if (arg == "-l" || arg == "--logit-bias") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -796,36 +1038,51 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - } else if (arg == "-h" || arg == "--help") { + } + if (arg == "-h" || arg == "--help") { + arg_found = true; return false; - - } else if (arg == "--version") { + } + if (arg == "--version") { + arg_found = true; fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); exit(0); - } else if (arg == "--random-prompt") { + } + if (arg == "--random-prompt") { + arg_found = true; params.random_prompt = true; - } else if (arg == "--in-prefix-bos") { + } + if (arg == "--in-prefix-bos") { + arg_found = true; params.input_prefix_bos = true; - } else if (arg == "--in-prefix") { + } + if (arg == "--in-prefix") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.input_prefix = argv[i]; - } else if (arg == "--in-suffix") { + } + if (arg == "--in-suffix") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } params.input_suffix = argv[i]; - } else if (arg == "--grammar") { + } + if (arg == "--grammar") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; } sparams.grammar = argv[i]; - } else if (arg == "--grammar-file") { + } + if (arg == "--grammar-file") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -841,7 +1098,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { 
std::istreambuf_iterator(), std::back_inserter(sparams.grammar) ); - } else if (arg == "--override-kv") { + } + if (arg == "--override-kv") { + arg_found = true; if (++i >= argc) { invalid_param = true; break; @@ -884,10 +1143,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { params.kv_overrides.push_back(kvo); #ifndef LOG_DISABLE_LOGS // Parse args for logging parameters - } else if ( log_param_single_parse( argv[i] ) ) { + } + if ( log_param_single_parse( argv[i] ) ) { + arg_found = true; // Do nothing, log_param_single_parse automatically does it's thing // and returns if a match was found and parsed. - } else if ( log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i] ) ) { + } + if ( log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i] ) ) { + arg_found = true; // We have a matching known parameter requiring an argument, // now we need to check if there is anything after this argv // and flag invalid_param or parse it. @@ -901,7 +1164,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } // End of Parse args for logging parameters #endif // LOG_DISABLE_LOGS - } else { + } + + if (!arg_found) { throw std::invalid_argument("error: unknown argument: " + arg); } } @@ -1096,6 +1361,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n"); printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); + printf(" --control-vector FNAME\n"); + printf(" add a control vector\n"); + printf(" --control-vector-scaled FNAME S\n"); + printf(" add a control vector with user defined scaling S\n"); + printf(" --control-vector-layer-range START END\n"); + printf(" layer range to apply the control vector(s) to, start and end inclusive\n"); printf(" -m FNAME, --model FNAME\n"); 
printf(" model path (default: %s)\n", params.model.c_str()); printf(" -md FNAME, --model-draft FNAME\n"); @@ -1361,6 +1632,30 @@ std::tuple llama_init_from_gpt_par return std::make_tuple(nullptr, nullptr); } + if (!params.control_vectors.empty()) { + if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1; + if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model); + + const auto cvec = llama_control_vector_load(params.control_vectors); + if (cvec.n_embd == -1) { + llama_free(lctx); + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); + } + + int err = llama_control_vector_apply(lctx, + cvec.data.data(), + cvec.data.size(), + cvec.n_embd, + params.control_vector_layer_start, + params.control_vector_layer_end); + if (err) { + llama_free(lctx); + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); + } + } + for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]); float lora_scale = std::get<1>(params.lora_adapter[i]); @@ -1891,3 +2186,160 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n) return sum / (sqrt(sum1) * sqrt(sum2)); } + +// +// Control vector utils +// + +static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) { + int32_t n_tensors; + + size_t n_bytes = 0; + + uint32_t max_direction_layer = 0; + + llama_control_vector_data result = { -1, {} }; + + // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer + { + struct ggml_init_params meta_params = { + /* .mem_size = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(), + /* .mem_buffer = */ nullptr, + /* .no_alloc = */ true, + }; + ggml_context * meta_ctx = ggml_init(meta_params); + struct gguf_init_params meta_gguf_params = { + /* .no_alloc = */ true, + /* .ctx = */ &meta_ctx, + }; + struct gguf_context * 
meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params); + if (!meta_ctx_gguf) { + fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str()); + ggml_free(meta_ctx); + return result; + } + + n_tensors = gguf_get_n_tensors(meta_ctx_gguf); + for (int i = 0; i < n_tensors; i++) { + std::string name = gguf_get_tensor_name(meta_ctx_gguf, i); + + // split on '.' + size_t dotpos = name.find('.'); + if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") { + try { + uint32_t layer = std::stoi(name.substr(dotpos + 1)); + if (layer == 0) { + fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str()); + ggml_free(meta_ctx); + gguf_free(meta_ctx_gguf); + return result; + } + if (layer > max_direction_layer) { + max_direction_layer = layer; + } + } catch (...) { + fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str()); + ggml_free(meta_ctx); + gguf_free(meta_ctx_gguf); + return result; + } + } + + struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str()); + if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) { + fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str()); + ggml_free(meta_ctx); + gguf_free(meta_ctx_gguf); + return result; + } + if (result.n_embd == -1) { + result.n_embd = ggml_nelements(tensor_meta); + } else if (ggml_nelements(tensor_meta) != result.n_embd) { + fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str()); + ggml_free(meta_ctx); + gguf_free(meta_ctx_gguf); + return result; + } + n_bytes += ggml_nbytes(tensor_meta); + } + ggml_free(meta_ctx); + gguf_free(meta_ctx_gguf); + } + + if (n_tensors == 0) { + fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str()); + return result; + } + + // load and scale tensors into final control vector context + struct 
ggml_init_params ggml_params = { + /* .mem_size = */ ggml_tensor_overhead() * n_tensors + n_bytes, + /* .mem_buffer = */ nullptr, + /* .no_alloc = */ false, + }; + struct ggml_context * ctx = ggml_init(ggml_params); + + struct gguf_init_params params = { + /*.no_alloc = */ false, + /*.ctx = */ &ctx, + }; + struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), params); + if (!ctx_gguf) { + fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str()); + ggml_free(ctx); + return result; + } + + // do not store data for layer 0 (it's not used) + result.data.resize(result.n_embd * max_direction_layer); + + for (uint32_t il = 1; il <= max_direction_layer; il++) { + const std::string name = "direction." + std::to_string(il); + const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str()); + + float * dst = result.data.data() + result.n_embd * (il - 1); + + if (tensor) { + const float * src = (const float *) tensor->data; + for (int j = 0; j < result.n_embd; j++) { + dst[j] = src[j] * load_info.strength; + } + } else { + for (int j = 0; j < result.n_embd; j++) { + dst[j] = 0.0f; + } + } + } + + return result; +} + +llama_control_vector_data llama_control_vector_load(const std::vector & load_infos) { + llama_control_vector_data result = { -1, {} }; + + for (const auto & info : load_infos) { + auto cur = llama_control_vector_load_one(info); + + if (cur.n_embd == -1) { + return result; + } + if (result.n_embd != -1 && (result.n_embd != cur.n_embd || result.data.size() != cur.data.size())) { + fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, info.fname.c_str()); + return result; + } + + if (result.n_embd == -1) { + result = std::move(cur); + } else { + for (size_t i = 0; i < cur.data.size(); i++) { + result.data[i] += cur.data[i]; + } + } + } + + if (result.n_embd == -1) { + fprintf(stderr, "%s: no vectors passed\n", __func__); + } + + return result; +} diff --git 
a/common/common.h b/common/common.h index 7484bf1d7..5ee0d2c17 100644 --- a/common/common.h +++ b/common/common.h @@ -31,10 +31,15 @@ fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \ } while(0) +// build info + +struct llama_control_vector_load_info; + +int32_t get_num_physical_cores(); + // // CLI argument parsing // -int32_t get_num_physical_cores(); struct gpt_params { uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed @@ -117,6 +122,11 @@ struct gpt_params { std::vector> lora_adapter; // lora adapter path with user defined scale std::string lora_base = ""; // base model path for the lora adapter + std::vector control_vectors; // control vector with user defined scale + + int32_t control_vector_layer_start = -1; // layer range for control vector + int32_t control_vector_layer_end = -1; // layer range for control vector + int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used. int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line // (which is more convenient to use for plotting) @@ -283,3 +293,24 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40 void llama_embd_normalize(const float * inp, float * out, int n); float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n); + +// +// Control vector utils +// + +struct llama_control_vector_data { + int n_embd; + + // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd + std::vector data; +}; + +struct llama_control_vector_load_info { + float strength; + + std::string fname; +}; + +// Load control vectors, scale each by strength, and add them together. 
+// On error, returns {-1, empty} +llama_control_vector_data llama_control_vector_load(const std::vector & load_infos); diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 5eee32016..cf1f98d66 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1965,6 +1965,23 @@ class MambaModel(Model): self.gguf_writer.add_tensor(new_name, data) +@Model.register("CohereForCausalLM") +class CommandR2Model(Model): + model_arch = gguf.MODEL_ARCH.COMMAND_R + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # max_position_embeddings = 8192 in config.json but model was actually + # trained on 128k context length + self.hparams["max_position_embeddings"] = self.hparams["model_max_length"] + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_logit_scale(self.hparams["logit_scale"]) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + + ###### CONVERSION LOGIC ###### diff --git a/examples/gritlm/README.md b/examples/gritlm/README.md new file mode 100644 index 000000000..64cc19204 --- /dev/null +++ b/examples/gritlm/README.md @@ -0,0 +1,62 @@ +## Generative Representational Instruction Tuning (GRIT) Example +[gritlm] a model which can generate embeddings as well as "normal" text +generation depending on the instructions in the prompt. + +* Paper: https://arxiv.org/pdf/2402.09906.pdf + +### Retrieval-Augmented Generation (RAG) use case +One use case for `gritlm` is to use it with RAG. If we recall how RAG works is +that we take documents that we want to use as context, to ground the large +language model (LLM), and we create token embeddings for them. We then store +these token embeddings in a vector database. + +When we perform a query, prompt the LLM, we will first create token embeddings +for the query and then search the vector database to retrieve the most +similar vectors, and return those documents so they can be passed to the LLM as +context. 
Then the query and the context will be passed to the LLM, which will +have to _again_ create token embeddings for the query. But because gritlm is used, +the first query can be cached and the second generation of token embeddings for the query does +not have to be performed at all. + +### Running the example +Download a Grit model: +```console +$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf +``` + +Run the example using the downloaded model: +```console +$ ./gritlm -m gritlm-7b_q4_1.gguf + +Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605 +Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103 +Cosine similarity between "Generative Representational Instruction Tuning" and "A purely peer-to-peer version of electronic cash w" is: 0.112 +Cosine similarity between "Generative Representational Instruction Tuning" and "All text-based language problems can be reduced to" is: 0.547 + +Oh, brave adventurer, who dared to climb +The lofty peak of Mt. Fuji in the night, +When shadows lurk and ghosts do roam, +And darkness reigns, a fearsome sight. + +Thou didst set out, with heart aglow, +To conquer this mountain, so high, +And reach the summit, where the stars do glow, +And the moon shines bright, up in the sky. + +Through the mist and fog, thou didst press on, +With steadfast courage, and a steadfast will, +Through the darkness, thou didst not be gone, +But didst climb on, with a steadfast skill. + +At last, thou didst reach the summit's crest, +And gazed upon the world below, +And saw the beauty of the night's best, +And felt the peace, that only nature knows. + +Oh, brave adventurer, who dared to climb +The lofty peak of Mt. Fuji in the night, +Thou art a hero, in the eyes of all, +For thou didst conquer this mountain, so bright.
+``` + +[gritlm]: https://github.com/ContextualAI/gritlm diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 1a4af009e..9f7e9fdad 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -1124,15 +1125,19 @@ struct sql_printer : public printer { static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) { llama_set_n_threads(ctx, n_threads, n_threads); - //std::vector tokens(n_prompt, llama_token_bos(llama_get_model(ctx))); - //llama_decode(ctx, llama_batch_get_one(tokens.data(), n_prompt, n_past, 0)); - //GGML_UNUSED(n_batch); + const llama_model * model = llama_get_model(ctx); + const int32_t n_vocab = llama_n_vocab(model); + + std::vector tokens(n_batch); - std::vector tokens(n_batch, llama_token_bos(llama_get_model(ctx))); int n_processed = 0; while (n_processed < n_prompt) { int n_tokens = std::min(n_prompt - n_processed, n_batch); + tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab; + for (int i = 1; i < n_tokens; i++) { + tokens[i] = std::rand() % n_vocab; + } llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0)); n_processed += n_tokens; } @@ -1143,11 +1148,15 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) { llama_set_n_threads(ctx, n_threads, n_threads); - llama_token token = llama_token_bos(llama_get_model(ctx)); + const llama_model * model = llama_get_model(ctx); + const int32_t n_vocab = llama_n_vocab(model); + + llama_token token = llama_add_bos_token(model) ? 
llama_token_bos(model) : std::rand() % n_vocab; for (int i = 0; i < n_gen; i++) { llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0)); llama_synchronize(ctx); + token = std::rand() % n_vocab; } } diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 2035554ea..a0ed82d7e 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1235,16 +1235,16 @@ struct clip_image_f32 * clip_image_f32_init() { void clip_image_u8_free(struct clip_image_u8 * img) { delete img; } void clip_image_f32_free(struct clip_image_f32 * img) { delete img; } -void clip_image_u8_batch_free(struct clip_image_u8_batch & batch) { - if (batch.size > 0) { - delete[] batch.data; - batch.size = 0; +void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { + if (batch->size > 0) { + delete[] batch->data; + batch->size = 0; } } -void clip_image_f32_batch_free(struct clip_image_f32_batch & batch) { - if (batch.size > 0) { - delete[] batch.data; - batch.size = 0; +void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { + if (batch->size > 0) { + delete[] batch->data; + batch->size = 0; } } @@ -1497,7 +1497,7 @@ static std::vector divide_to_patches_u8(const clip_image_u8 & im // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector // res_imgs memory is being allocated here, previous allocations will be freed if found -bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs) { +bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) { bool pad_to_square = true; if (!ctx->has_vision_encoder) { printf("This gguf file seems to have no vision encoder\n"); @@ -1509,11 +1509,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli pad_to_square = false; } // free the previous res_imgs if any set - if 
(res_imgs.size > 0) { + if (res_imgs->size > 0) { clip_image_f32_batch_free(res_imgs); } - res_imgs.data = nullptr; - res_imgs.size = 0; + res_imgs->data = nullptr; + res_imgs->size = 0; // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 @@ -1568,11 +1568,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square patches.insert(patches.begin(), image_original_resize); // clip_image_f32_batch_init(patches.size()); - res_imgs.size = patches.size(); - res_imgs.data = new clip_image_f32[res_imgs.size]; + res_imgs->size = patches.size(); + res_imgs->data = new clip_image_f32[res_imgs->size]; int num=0; for (auto& patch : patches) { - normalize_image_u8_to_f32(patch, &res_imgs.data[num], ctx->image_mean, ctx->image_std); + normalize_image_u8_to_f32(patch, &res_imgs->data[num], ctx->image_mean, ctx->image_std); num++; } @@ -1660,9 +1660,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli // } // res_imgs.push_back(res); - res_imgs.size = 1; - res_imgs.data = new clip_image_f32[res_imgs.size]; - res_imgs.data[0] = *res; + res_imgs->size = 1; + res_imgs->data = new clip_image_f32[res_imgs->size]; + res_imgs->data[0] = *res; clip_image_f32_free(res); return true; diff --git a/examples/llava/clip.h b/examples/llava/clip.h index e5bd54924..45bdad689 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -60,8 +60,8 @@ CLIP_API struct clip_image_f32 * clip_image_f32_init(); CLIP_API void clip_image_u8_free (struct clip_image_u8 * img); CLIP_API void clip_image_f32_free(struct clip_image_f32 * img); -CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch & batch); -CLIP_API 
void clip_image_f32_batch_free(struct clip_image_f32_batch & batch); +CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch); +CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch); CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); @@ -69,7 +69,7 @@ CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); /** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */ -CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs ); +CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs ); CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx); diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 980128166..29764757a 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -223,7 +223,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli clip_image_f32_batch img_res_v; img_res_v.size = 0; img_res_v.data = nullptr; - if (!clip_image_preprocess(ctx_clip, img, img_res_v)) { + if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) { fprintf(stderr, "%s: unable to preprocess image\n", __func__); delete[] img_res_v.data; return false; diff --git a/examples/llava/llava.h b/examples/llava/llava.h index 2d40f3f1d..19212f6e9 100644 --- a/examples/llava/llava.h +++ b/examples/llava/llava.h @@ -29,9 +29,9 @@ struct llava_image_embed { }; /** sanity check for clip <-> llava embed size match */ -LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip); +LLAVA_API bool llava_validate_embed_size(const struct llama_context * 
ctx_llama, const struct clip_ctx * ctx_clip); -LLAVA_API bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out); +LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out); /** build an image embed from image file bytes */ LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length); diff --git a/examples/sycl/build.sh b/examples/sycl/build.sh index 26ad2f7da..f20391d7a 100755 --- a/examples/sycl/build.sh +++ b/examples/sycl/build.sh @@ -13,8 +13,11 @@ source /opt/intel/oneapi/setvars.sh #for FP32 cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -#build example/main only +#build example/main #cmake --build . --config Release --target main +#build example/llama-bench +#cmake --build . --config Release --target llama-bench + #build all binary cmake --build . --config Release -v diff --git a/examples/sycl/run-llama2.sh b/examples/sycl/run-llama2.sh index 52f7c01a4..c979a52f6 100755 --- a/examples/sycl/run-llama2.sh +++ b/examples/sycl/run-llama2.sh @@ -9,18 +9,28 @@ source /opt/intel/oneapi/setvars.sh if [ $# -gt 0 ]; then GGML_SYCL_DEVICE=$1 + GGML_SYCL_SINGLE_GPU=1 else GGML_SYCL_DEVICE=0 fi -echo "use $GGML_SYCL_DEVICE as main GPU" + #export GGML_SYCL_DEBUG=1 #ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer. 
-#use all GPUs with same max compute units -ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 +if [ "${GGML_SYCL_SINGLE_GPU:-0}" -eq 1 ]; then + echo "use $GGML_SYCL_DEVICE as main GPU" + #use single GPU only (GGML_SYCL_SINGLE_GPU is only set when a device id was passed, hence the :-0 default above) + ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none +else + #use multiple GPUs with same max compute units + ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 +fi #use main GPU only #ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none +#use multiple GPUs with same max compute units +#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 + diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 2ca5ce88a..2cd6ede39 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -11539,6 +11539,7 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev if (ggml_backend_is_cuda(event->backend)) { CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx->device][0], (cudaEvent_t)event->context, 0)); } else { +#if 0 // untested auto wait_fn = [](void * user_data) { ggml_backend_event_t event = (ggml_backend_event_t)user_data; @@ -11546,6 +11547,8 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev }; CUDA_CHECK(cudaLaunchHostFunc(g_cudaStreams[cuda_ctx->device][0], wait_fn, event)); +#endif + GGML_ASSERT(false); } } diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 9f6506383..6dc5eb20c 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -24,10 +25,9 @@ #include #include #include - #include #include - +#include #include #include @@ -82,6 +82,30 @@ Following definition copied from DPCT head files, which are used by ggml-sycl.cp #define
__dpct_noinline__ __attribute__((noinline)) #endif + +std::string get_device_type_name(const sycl::device &Device) { + auto DeviceType = Device.get_info(); + switch (DeviceType) { + case sycl::info::device_type::cpu: + return "cpu"; + case sycl::info::device_type::gpu: + return "gpu"; + case sycl::info::device_type::host: + return "host"; + case sycl::info::device_type::accelerator: + return "acc"; + default: + return "unknown"; + } +} + +std::string get_device_backend_and_type(const sycl::device &device) { + std::stringstream device_type; + sycl::backend backend = device.get_backend(); + device_type << backend << ":" << get_device_type_name(device); + return device_type.str(); +} + namespace dpct { typedef sycl::queue *queue_ptr; @@ -942,17 +966,65 @@ namespace dpct private: mutable std::recursive_mutex m_mutex; + static bool compare_dev(sycl::device &device1, sycl::device &device2) + { + dpct::device_info prop1; + dpct::get_device_info(prop1, device1); + dpct::device_info prop2; + dpct::get_device_info(prop2, device2); + return prop1.get_max_compute_units() > prop2.get_max_compute_units(); + } + static int convert_backend_index(std::string & backend) { + if (backend == "ext_oneapi_level_zero:gpu") return 0; + if (backend == "opencl:gpu") return 1; + if (backend == "opencl:cpu") return 2; + if (backend == "opencl:acc") return 3; + printf("convert_backend_index: can't handle backend=%s\n", backend.c_str()); + GGML_ASSERT(false); + } + static bool compare_backend(std::string &backend1, std::string &backend2) { + return convert_backend_index(backend1) < convert_backend_index(backend2); + } dev_mgr() { sycl::device default_device = sycl::device(sycl::default_selector_v); _devs.push_back(std::make_shared(default_device)); - std::vector sycl_all_devs = - sycl::device::get_devices(sycl::info::device_type::all); + std::vector sycl_all_devs; // Collect other devices except for the default device. 
if (default_device.is_cpu()) _cpu_device = 0; + + auto Platforms = sycl::platform::get_platforms(); + // Keep track of the number of devices per backend + std::map DeviceNums; + std::map> backend_devices; + + while (!Platforms.empty()) { + auto Platform = Platforms.back(); + Platforms.pop_back(); + auto devices = Platform.get_devices(); + std::string backend_type = get_device_backend_and_type(devices[0]); + for (const auto &device : devices) { + backend_devices[backend_type].push_back(device); + } + } + + std::vector keys; + for(auto it = backend_devices.begin(); it != backend_devices.end(); ++it) { + keys.push_back(it->first); + } + std::sort(keys.begin(), keys.end(), compare_backend); + + for (auto &key : keys) { + std::vector devs = backend_devices[key]; + std::sort(devs.begin(), devs.end(), compare_dev); + for (const auto &dev : devs) { + sycl_all_devs.push_back(dev); + } + } + for (auto &dev : sycl_all_devs) { if (dev == default_device) @@ -3202,6 +3274,11 @@ static int g_work_group_size = 0; #define GGML_SYCL_MMV_Y 1 #endif +enum ggml_sycl_backend_gpu_mode { + SYCL_UNSET_GPU_MODE = -1, + SYCL_SINGLE_GPU_MODE = 0, + SYCL_MUL_GPU_MODE +}; static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size"); @@ -3401,12 +3478,31 @@ class sycl_gpu_mgr { int work_group_size = 0; std::string gpus_list = ""; + /* + Use all GPUs with same top max compute units + */ sycl_gpu_mgr() { detect_sycl_gpu_list_with_max_cu(); get_allow_gpus(); create_context_with_gpus(); } + /* + Only use the assigned GPU + */ + sycl_gpu_mgr(int main_gpu_id) { + sycl::device device = dpct::dev_mgr::instance().get_device(main_gpu_id); + dpct::device_info prop; + dpct::get_device_info(prop, device); + gpus.push_back(main_gpu_id); + devices.push_back(device); + work_group_size = prop.get_max_work_group_size(); + max_compute_units = prop.get_max_compute_units(); + + get_allow_gpus(); + create_context_with_gpus(); + } + void create_context_with_gpus() { sycl::context ctx = 
sycl::context(devices); assert(gpus.size() > 0); @@ -3422,7 +3518,7 @@ class sycl_gpu_mgr { gpus_list += std::to_string(gpus[i]); gpus_list += ","; } - if (gpus_list.length() > 2) { + if (gpus_list.length() > 1) { gpus_list.pop_back(); } } @@ -3451,7 +3547,7 @@ class sycl_gpu_mgr { dpct::device_info prop; dpct::get_device_info(prop, device); if (max_compute_units == prop.get_max_compute_units() && - prop.get_major_version() == 1) { + is_ext_oneapi_device(device)) { gpus.push_back(id); devices.push_back(device); work_group_size = prop.get_max_work_group_size(); @@ -3471,8 +3567,8 @@ class sycl_gpu_mgr { if (gpus[i] == id) return i; } - assert(false); - return -1; + printf("miss to get device index by id=%d\n", id); + GGML_ASSERT(false); } int get_next_index(int id) { @@ -3481,8 +3577,16 @@ class sycl_gpu_mgr { if (gpus[i] == id) return i; } - assert(false); - return -1; + GGML_ASSERT(false); + } + + bool is_ext_oneapi_device(const sycl::device &dev) { + sycl::backend dev_backend = dev.get_backend(); + if (dev_backend == sycl::backend::ext_oneapi_level_zero || + dev_backend == sycl::backend::ext_oneapi_cuda || + dev_backend == sycl::backend::ext_oneapi_hip) + return true; + return false; } }; @@ -3491,11 +3595,14 @@ static int g_device_count = -1; static int g_all_sycl_device_count = -1; static int g_main_device = -1; static int g_main_device_id = -1; +static bool g_ggml_backend_sycl_buffer_type_initialized = false; static std::array g_default_tensor_split = {}; static float g_tensor_split[GGML_SYCL_MAX_DEVICES] = {0}; +static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode = SYCL_UNSET_GPU_MODE; + struct sycl_device_capabilities { int cc; // compute capability bool vmm; // virtual memory support @@ -12999,17 +13106,20 @@ bool ggml_sycl_loaded(void) { return g_sycl_loaded; } -void print_device_detail(int id) { +void print_device_detail(int id, sycl::device &device, std::string device_type) { + dpct::device_info prop; SYCL_CHECK(CHECK_TRY_ERROR( - 
dpct::get_device_info(prop, dpct::dev_mgr::instance().get_device(id)))); - sycl::device cur_device = dpct::dev_mgr::instance().get_device(id); + dpct::get_device_info(prop, device))); + std::string version; version += std::to_string(prop.get_major_version()); version += "."; version += std::to_string(prop.get_minor_version()); - fprintf(stderr, "|%2d|%45s|%18s|%17d|%14d|%13d|%15lu|\n", id, + device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), ""); + + fprintf(stderr, "|%2d|%18s|%45s|%10s|%11d|%8d|%7d|%15lu|\n", id, device_type.c_str(), prop.get_name(), version.c_str(), prop.get_max_compute_units(), prop.get_max_work_group_size(), prop.get_max_sub_group_size(), prop.get_global_mem_size()); @@ -13017,19 +13127,35 @@ void print_device_detail(int id) { void ggml_backend_sycl_print_sycl_devices() { int device_count = dpct::dev_mgr::instance().device_count(); + std::map DeviceNums; fprintf(stderr, "found %d SYCL devices:\n", device_count); - fprintf(stderr, "|ID| Name |compute capability|Max compute units|Max work group|Max sub group|Global mem size|\n"); - fprintf(stderr, "|--|---------------------------------------------|------------------|-----------------|--------------|-------------|---------------|\n"); + fprintf(stderr, "| | | |Compute |Max compute|Max work|Max sub| |\n"); + fprintf(stderr, "|ID| Device Type| Name|capability|units |group |group |Global mem size|\n"); + fprintf(stderr, "|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|\n"); for (int id = 0; id < device_count; ++id) { - print_device_detail(id); + sycl::device device = dpct::dev_mgr::instance().get_device(id); + sycl::backend backend = device.get_backend(); + std::string backend_type = get_device_backend_and_type(device); + int type_id=DeviceNums[backend_type]++; + std::stringstream device_type; + device_type << "[" << backend_type << ":" << std::to_string(type_id) << "]"; + print_device_detail(id, 
device, device_type.str()); } } void print_gpu_device_list() { - fprintf(stderr, "detect %d SYCL GPUs: [%s] with Max compute units:%d\n", - g_sycl_gpu_mgr->get_gpu_count(), - g_sycl_gpu_mgr->gpus_list.c_str(), - g_sycl_gpu_mgr->max_compute_units); + GGML_ASSERT(g_sycl_gpu_mgr); + + const char * hint = nullptr; // format string: literals are const, so binding them to plain char* is ill-formed since C++11 + if (g_ggml_sycl_backend_gpu_mode == SYCL_SINGLE_GPU_MODE) { + hint = "use %d SYCL GPUs: [%s] with Max compute units:%d\n"; + } else { + hint = "detect %d SYCL GPUs: [%s] with top Max compute units:%d\n"; + } + fprintf(stderr, hint, + g_sycl_gpu_mgr->get_gpu_count(), + g_sycl_gpu_mgr->gpus_list.c_str(), + g_sycl_gpu_mgr->max_compute_units); } int get_sycl_env(const char *env_name, int default_val) { @@ -13065,23 +13191,6 @@ void ggml_init_sycl() try { #else fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__); #endif - if (CHECK_TRY_ERROR(g_all_sycl_device_count = - dpct::dev_mgr::instance().device_count()) != 0) { - initialized = true; - g_sycl_loaded = false; - return; - } - GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES); - ggml_backend_sycl_print_sycl_devices(); - - if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr(); - - g_device_count = g_sycl_gpu_mgr->get_gpu_count(); - g_work_group_size = g_sycl_gpu_mgr->work_group_size; - - print_gpu_device_list(); - - int64_t total_vram = 0; /* NOT REMOVE, keep it for next optimize for XMX.
#if defined(SYCL_USE_XMX) @@ -13090,49 +13199,15 @@ void ggml_init_sycl() try { fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__); #endif */ - for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) { - g_device_caps[id].vmm = 0; - g_device_caps[id].device_id = -1; - g_device_caps[id].cc = 0; - g_tensor_split[id] = 0; - g_default_tensor_split[id] = 0; + + if (CHECK_TRY_ERROR(g_all_sycl_device_count = + dpct::dev_mgr::instance().device_count()) != 0) { + initialized = true; + g_sycl_loaded = false; + return; } - - for (int i = 0; i < g_device_count; ++i) { - int device_id = g_sycl_gpu_mgr->gpus[i]; - g_device_caps[i].vmm = 0; - - dpct::device_info prop; - SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( - prop, dpct::dev_mgr::instance().get_device(device_id)))); - - g_default_tensor_split[i] = total_vram; - total_vram += prop.get_global_mem_size(); - - g_device_caps[i].cc = - 100 * prop.get_major_version() + 10 * prop.get_minor_version(); - } - - for (int i = 0; i < g_device_count; ++i) { - g_default_tensor_split[i] /= total_vram; - } - - for (int i = 0; i < g_device_count; ++i) { - SYCL_CHECK(ggml_sycl_set_device(i)); - - // create sycl streams - for (int is = 0; is < MAX_STREAMS; ++is) { - SYCL_CHECK(CHECK_TRY_ERROR( - g_syclStreams[i][is] = - dpct::get_current_device().create_queue( - g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device()))); - } - - const dpct::queue_ptr stream = g_syclStreams[i][0]; - // create sycl handle - SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream)); - } - + GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES); + ggml_backend_sycl_print_sycl_devices(); initialized = true; g_sycl_loaded = true; } @@ -13143,6 +13218,63 @@ catch (sycl::exception const &exc) { std::exit(1); } +void ggml_init_by_gpus(int device_count) try { + g_device_count = device_count; + g_work_group_size = g_sycl_gpu_mgr->work_group_size; + + int64_t total_vram = 0; + + print_gpu_device_list(); + + for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) { + 
g_device_caps[id].vmm = 0; + g_device_caps[id].device_id = -1; + g_device_caps[id].cc = 0; + g_tensor_split[id] = 0; + g_default_tensor_split[id] = 0; + } + + for (int i = 0; i < g_device_count; ++i) { + int device_id = g_sycl_gpu_mgr->gpus[i]; + g_device_caps[i].vmm = 0; + + dpct::device_info prop; + SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( + prop, dpct::dev_mgr::instance().get_device(device_id)))); + + g_default_tensor_split[i] = total_vram; + total_vram += prop.get_global_mem_size(); + + g_device_caps[i].cc = + 100 * prop.get_major_version() + 10 * prop.get_minor_version(); + } + + for (int i = 0; i < g_device_count; ++i) { + g_default_tensor_split[i] /= total_vram; + } + + for (int i = 0; i < g_device_count; ++i) { + SYCL_CHECK(ggml_sycl_set_device(i)); + + // create sycl streams + for (int is = 0; is < MAX_STREAMS; ++is) { + SYCL_CHECK(CHECK_TRY_ERROR( + g_syclStreams[i][is] = + dpct::get_current_device().create_queue( + g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device()))); + } + + const dpct::queue_ptr stream = g_syclStreams[i][0]; + // create sycl handle + SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream)); + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + void *ggml_sycl_host_malloc(size_t size) try { if (getenv("GGML_SYCL_NO_PINNED") != nullptr) { return nullptr; @@ -16542,22 +16674,24 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = { /* .is_host = */ nullptr, }; -ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) { +ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) { + if (device_index>=g_device_count or device_index<0) { + printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n", + device_index, g_device_count-1); + GGML_ASSERT(device_indexgpus[i])}, }; } 
- ggml_backend_sycl_buffer_type_initialized = true; + g_ggml_backend_sycl_buffer_type_initialized = true; } - - return &ggml_backend_sycl_buffer_types[device]; + return &ggml_backend_sycl_buffer_types[device_index]; } // sycl split buffer type @@ -17310,11 +17444,42 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) { return g_sycl_gpu_mgr->get_index(device_id); } +GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index) { + return g_sycl_gpu_mgr->gpus[device_index]; +} + +GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) { + GGML_ASSERT(main_gpu_idget_gpu_count()); + g_ggml_backend_sycl_buffer_type_initialized = false; +} + +GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode() { + if (g_ggml_sycl_backend_gpu_mode == SYCL_MUL_GPU_MODE) { + return; + } + + fprintf(stderr, "ggml_backend_sycl_set_mul_device_mode: true\n"); + + if (g_sycl_gpu_mgr) { + delete g_sycl_gpu_mgr; + } + g_sycl_gpu_mgr = new sycl_gpu_mgr(); + g_ggml_sycl_backend_gpu_mode = SYCL_MUL_GPU_MODE; + ggml_init_by_gpus(g_sycl_gpu_mgr->get_gpu_count()); + g_ggml_backend_sycl_buffer_type_initialized = false; +} + extern "C" int ggml_backend_sycl_reg_devices(); int ggml_backend_sycl_reg_devices() { - if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr(); - g_device_count = g_sycl_gpu_mgr->get_gpu_count(); + ggml_backend_sycl_set_mul_device_mode(); assert(g_device_count>0); for (int i = 0; i < g_device_count; i++) { int id = g_sycl_gpu_mgr->gpus[i]; diff --git a/ggml-sycl.h b/ggml-sycl.h index bf5b11b36..c549a64a1 100644 --- a/ggml-sycl.h +++ b/ggml-sycl.h @@ -29,6 +29,11 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_typ GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total); GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id); +// TODO: these are temporary +// ref: 
https://github.com/ggerganov/llama.cpp/pull/6022#issuecomment-1992615670 +GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index); +GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id); +GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode(); #ifdef __cplusplus } #endif diff --git a/ggml.c b/ggml.c index 8176411d3..75245cca5 100644 --- a/ggml.c +++ b/ggml.c @@ -470,6 +470,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .type_size = sizeof(int32_t), .is_quantized = false, }, + [GGML_TYPE_I64] = { + .type_name = "i64", + .blck_size = 1, + .type_size = sizeof(int64_t), + .is_quantized = false, + }, + [GGML_TYPE_F64] = { + .type_name = "f64", + .blck_size = 1, + .type_size = sizeof(double), + .is_quantized = false, + .nrows = 1, + }, [GGML_TYPE_F32] = { .type_name = "f32", .blck_size = 1, @@ -918,6 +931,101 @@ inline static float vaddvq_f32(float32x4_t v) { #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE #endif +#elif defined(__AVX512F__) + +#define GGML_SIMD + +// F32 AVX512 + +#define GGML_F32_STEP 64 +#define GGML_F32_EPR 16 + +#define GGML_F32x16 __m512 +#define GGML_F32x16_ZERO _mm512_setzero_ps() +#define GGML_F32x16_SET1(x) _mm512_set1_ps(x) +#define GGML_F32x16_LOAD _mm512_loadu_ps +#define GGML_F32x16_STORE _mm512_storeu_ps +// _mm512_fmadd_ps is defined in AVX512F so no guard is required +#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a) +#define GGML_F32x16_ADD _mm512_add_ps +#define GGML_F32x16_MUL _mm512_mul_ps +#define GGML_F32x16_REDUCE(res, x) \ +do { \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ps(x[i], x[offset+i]); \ + } \ + res = _mm512_reduce_add_ps(x[0]); \ +} while (0) + +// TODO: is this optimal ? 
+ +#define GGML_F32_VEC GGML_F32x16 +#define GGML_F32_VEC_ZERO GGML_F32x16_ZERO +#define GGML_F32_VEC_SET1 GGML_F32x16_SET1 +#define GGML_F32_VEC_LOAD GGML_F32x16_LOAD +#define GGML_F32_VEC_STORE GGML_F32x16_STORE +#define GGML_F32_VEC_FMA GGML_F32x16_FMA +#define GGML_F32_VEC_ADD GGML_F32x16_ADD +#define GGML_F32_VEC_MUL GGML_F32x16_MUL +#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE + +// F16 AVX512 + +// F16 AVX + +#define GGML_F16_STEP 64 +#define GGML_F16_EPR 16 + +// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead + +#define GGML_F32Cx16 __m512 +#define GGML_F32Cx16_ZERO _mm512_setzero_ps() +#define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x) + +// unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F +// so F16C guard isn't required +#define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((__m256i *)(x))) +#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0)) + +#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a) +#define GGML_F32Cx16_ADD _mm512_add_ps +#define GGML_F32Cx16_MUL _mm512_mul_ps +#define GGML_F32Cx16_REDUCE(res, x) \ +do { \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ps(x[i], x[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm512_add_ps(x[i], x[offset+i]); \ + } \ + res = _mm512_reduce_add_ps(x[0]); \ +} while (0) + +#define GGML_F16_VEC GGML_F32Cx16 +#define GGML_F16_VEC_ZERO GGML_F32Cx16_ZERO +#define GGML_F16_VEC_SET1 GGML_F32Cx16_SET1 +#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx16_LOAD(p) +#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i]) +#define GGML_F16_VEC_FMA GGML_F32Cx16_FMA +#define GGML_F16_VEC_ADD GGML_F32Cx16_ADD +#define GGML_F16_VEC_MUL GGML_F32Cx16_MUL +#define GGML_F16_VEC_REDUCE 
GGML_F32Cx16_REDUCE + #elif defined(__AVX__) #define GGML_SIMD @@ -12419,6 +12527,8 @@ static void ggml_compute_forward_alibi( case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: + case GGML_TYPE_I64: + case GGML_TYPE_F64: case GGML_TYPE_COUNT: { GGML_ASSERT(false); @@ -12505,6 +12615,8 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: + case GGML_TYPE_I64: + case GGML_TYPE_F64: case GGML_TYPE_COUNT: { GGML_ASSERT(false); diff --git a/ggml.h b/ggml.h index e549f9e0e..dac57957d 100644 --- a/ggml.h +++ b/ggml.h @@ -373,6 +373,8 @@ extern "C" { GGML_TYPE_I8 = 24, GGML_TYPE_I16 = 25, GGML_TYPE_I32 = 26, + GGML_TYPE_I64 = 27, + GGML_TYPE_F64 = 28, GGML_TYPE_COUNT, }; diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 2d7cf16c1..4a4facb06 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -42,6 +42,7 @@ class Keys: EXPERT_COUNT = "{arch}.expert_count" EXPERT_USED_COUNT = "{arch}.expert_used_count" POOLING_TYPE = "{arch}.pooling_type" + LOGIT_SCALE = "{arch}.logit_scale" class Attention: HEAD_COUNT = "{arch}.attention.head_count" @@ -121,6 +122,7 @@ class MODEL_ARCH(IntEnum): GEMMA = auto() STARCODER2 = auto() MAMBA = auto() + COMMAND_R = auto() class MODEL_TENSOR(IntEnum): @@ -187,6 +189,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.GEMMA: "gemma", MODEL_ARCH.STARCODER2: "starcoder2", MODEL_ARCH.MAMBA: "mamba", + MODEL_ARCH.COMMAND_R: "command-r", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -579,6 +582,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.SSM_D, MODEL_TENSOR.SSM_OUT, ], + MODEL_ARCH.COMMAND_R: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], # TODO } @@ -665,6 +680,8 @@ class GGMLQuantizationType(IntEnum): I8 = 
24 I16 = 25 I32 = 26 + I64 = 27 + F64 = 28 class GGUFEndian(IntEnum): @@ -734,6 +751,8 @@ GGML_QUANT_SIZES = { GGMLQuantizationType.I8: (1, 1), GGMLQuantizationType.I16: (1, 2), GGMLQuantizationType.I32: (1, 4), + GGMLQuantizationType.I64: (1, 8), + GGMLQuantizationType.F64: (1, 8), } diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py index 1c10f5753..33afac552 100644 --- a/gguf-py/gguf/gguf_reader.py +++ b/gguf-py/gguf/gguf_reader.py @@ -242,12 +242,15 @@ class GGUFReader: n_bytes = n_elems * type_size // block_size data_offs = int(start_offs + offset_tensor[0]) item_type: npt.DTypeLike - if ggml_type == GGMLQuantizationType.F32: - item_count = n_elems - item_type = np.float32 - elif ggml_type == GGMLQuantizationType.F16: + if ggml_type == GGMLQuantizationType.F16: item_count = n_elems item_type = np.float16 + elif ggml_type == GGMLQuantizationType.F32: + item_count = n_elems + item_type = np.float32 + elif ggml_type == GGMLQuantizationType.F64: + item_count = n_elems + item_type = np.float64 elif ggml_type == GGMLQuantizationType.I8: item_count = n_elems item_type = np.int8 @@ -257,6 +260,9 @@ class GGUFReader: elif ggml_type == GGMLQuantizationType.I32: item_count = n_elems item_type = np.int32 + elif ggml_type == GGMLQuantizationType.I64: + item_count = n_elems + item_type = np.int64 else: item_count = n_bytes item_type = np.uint8 diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 81b2eb884..2ae6c814b 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -204,18 +204,22 @@ class GGUFWriter: for i in range(n_dims): self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i]) if raw_dtype is None: - if tensor_dtype == np.float32: - dtype = GGMLQuantizationType.F32 - elif tensor_dtype == np.float16: + if tensor_dtype == np.float16: dtype = GGMLQuantizationType.F16 + elif tensor_dtype == np.float32: + dtype = GGMLQuantizationType.F32 + elif tensor_dtype == np.float64: + dtype = 
GGMLQuantizationType.F64 elif tensor_dtype == np.int8: dtype = GGMLQuantizationType.I8 elif tensor_dtype == np.int16: dtype = GGMLQuantizationType.I16 elif tensor_dtype == np.int32: dtype = GGMLQuantizationType.I32 + elif tensor_dtype == np.int64: + dtype = GGMLQuantizationType.I64 else: - raise ValueError("Only F32, F16, I8, I16, I32 tensors are supported for now") + raise ValueError("Only F16, F32, F64, I8, I16, I32, I64 tensors are supported for now") else: dtype = raw_dtype self.ti_data += self._pack("I", dtype) @@ -357,6 +361,9 @@ class GGUFWriter: def add_clamp_kqv(self, value: float) -> None: self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value) + def add_logit_scale(self, value: float) -> None: + self.add_float32(Keys.LLM.LOGIT_SCALE.format(arch=self.arch), value) + def add_expert_count(self, count: int) -> None: self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count) diff --git a/klite.embd b/klite.embd index fb14baf4b..3aeeaf62c 100644 --- a/klite.embd +++ b/klite.embd @@ -7,7 +7,7 @@ Just copy this single static HTML file anywhere and open it in a browser, or fro Please go to https://github.com/LostRuins/lite.koboldai.net for updates on Kobold Lite. If you are submitting a pull request for Lite, PLEASE use the above repo, not the KoboldCpp one. Kobold Lite is under the AGPL v3.0 License unless otherwise exempted. Please do not remove this line. 
-Current version: 124 +Current version: 125 -Concedo --> @@ -373,21 +373,6 @@ Current version: 124 margin-top: 100px; } - - .loadpopup { - width: 600px; - background-color: #262626; - margin-top: 150px; - } - - @media (max-width: 768px) { - .loadpopup { - width: 100%; - background-color: #262626; - margin-top: 150px; - } - } - .workerpopup { background-color: #262626; margin-top: 170px; @@ -420,7 +405,7 @@ Current version: 124 width: 330px; } .nspopup.flexsize { - width: 540px; + width: 600px; } @media (max-width: 620px) { .nspopup.flexsize { @@ -2966,7 +2951,7 @@ Current version: 124 } } - function apply_proxy_url(url) + function apply_proxy_url(url, proxy_by_default=false) { let proxy_part = ""; @@ -2980,7 +2965,7 @@ Current version: 124 !url.toLowerCase().includes(".")); } - if (uses_cors_proxy && !is_local) { + if ((uses_cors_proxy||proxy_by_default) && !is_local) { proxy_part = cors_proxy + "?"; } return proxy_part + url; @@ -3357,7 +3342,7 @@ Current version: 124 const text_hordes = [ { - baseurl: "https://horde.koboldai.net", + baseurl: "https://aihorde.net", tag: "🤖", sort_order: 1, client_agent: default_client_agent, @@ -3415,8 +3400,6 @@ Current version: 124 const default_oai_image_endpoint = "/images/generations"; - const scale_submit_endpoint = "https://dashboard.scale.com/spellbook/api/v2/deploy/" - const claude_submit_endpoint = "/complete"; const claude_submit_endpoint_v3 = "/messages"; @@ -3498,9 +3481,7 @@ Current version: 124 var custom_oai_endpoint = ""; var custom_oai_key = ""; //if set, uses the OpenAI API to generate var custom_oai_model = ""; - var custom_scale_key = ""; var custom_palm_key = ""; - var custom_scale_ID = ""; var custom_claude_endpoint = ""; var custom_claude_key = ""; var custom_claude_model = ""; @@ -3826,8 +3807,8 @@ Current version: 124 if(!localflag && pending_eptype>0) { msgboxYesNo("Reconnect to previous custom endpoint?","Custom Endpoint Reconnect",()=>{ - document.getElementById("customapidropdown").value = 
(pending_eptype - 1).toString(); - display_custom_endpoint(); + document.getElementById("customapidropdown").value = (pending_eptype).toString(); + display_endpoint_container(); },null); } } @@ -4032,7 +4013,7 @@ Current version: 124 function attempt_connect(popup_aiselect = true) { if (localflag) { - document.getElementById("customapidropdown").value = 0; + document.getElementById("customapidropdown").value = 1; let protocol = "http://"; if(window.location.protocol.includes('https') && !is_using_web_lite()) { @@ -4077,7 +4058,7 @@ Current version: 124 } } document.body.classList.add("connected"); - document.getElementById("connectstatus").innerHTML = "Connected to KoboldAI Horde"; + document.getElementById("connectstatus").innerHTML = "Connected to AI Horde"; document.getElementById("connectstatus").classList.remove("color_orange"); document.getElementById("connectstatus").classList.add("color_green"); render_gametext(false); @@ -4085,12 +4066,12 @@ Current version: 124 read_url_params_data(); if (popup_aiselect) { - display_models(); + display_endpoint_container(); } } else { - msgbox("Failed to connect to KAI Horde!\nPlease check your network connection."); + msgbox("Failed to connect to AI Service!\nPlease check your network connection."); document.body.classList.remove("connected"); document.getElementById("connectstatus").innerHTML = "Offline Mode"; document.getElementById("connectstatus").classList.add("color_orange"); @@ -4475,7 +4456,7 @@ Current version: 124 function is_using_custom_ep() { - return (custom_oai_key!=""||custom_kobold_endpoint!=""||custom_scale_key!=""||custom_claude_key!=""||custom_palm_key!=""); + return (custom_oai_key!=""||custom_kobold_endpoint!=""||custom_claude_key!=""||custom_palm_key!=""); } function is_using_kcpp_with_streaming() @@ -5517,7 +5498,7 @@ Current version: 124 //remove common malformed ids to reduce load if(userinput!="" && isNumeric(userinput) && userinput>0 && userinput<50000) { - 
fetch(cors_proxy+"?https://aetherroom.club/api/"+userinput) + fetch(apply_proxy_url("https://aetherroom.club/api/"+userinput,true)) .then(x => x.json()) .then(data => { console.log(data); @@ -5718,12 +5699,13 @@ Current version: 124 }; document.getElementById("scenariodesc").innerText = "Loading scenario from Pygmalion.Chat..."; - fetch(cors_proxy+"?https://server.pygmalion.chat/galatea.v1.PublicCharacterService/CharacterExport", { - method: 'POST', + let charurl = "https://server.pygmalion.chat/api/export/character/"+userinput+"/v2"; + fetch(apply_proxy_url(charurl,true), { + method: 'GET', headers: { 'Content-Type': 'application/json', }, - body: JSON.stringify({ "character_id": userinput }), + // body: JSON.stringify({ "character_id": userinput }), referrerPolicy: 'no-referrer', }) .then(x => { @@ -5738,12 +5720,12 @@ Current version: 124 }) .then(data => { console.log(data); - if(data && data.card) //if fetch was successful + if(data && data.character) //if fetch was successful { - load_temp_scenario_from_tavernobj(data.card,true); - if(data.card.data && data.card.data.avatar) + load_temp_scenario_from_tavernobj(data.character,true); + if(data.character.data && data.character.data.avatar) { - const compressedImg = compressImage(data.card.data.avatar, (compressedImageURI, aspectratio)=>{ + const compressedImg = compressImage(data.character.data.avatar, (compressedImageURI, aspectratio)=>{ temp_scenario.image = compressedImageURI; temp_scenario.image_aspect = aspectratio; preview_temp_scenario(); @@ -6433,7 +6415,6 @@ Current version: 124 { return !( document.getElementById("saveloadcontainer").classList.contains("hidden") && - document.getElementById("loadmodelcontainer").classList.contains("hidden") && document.getElementById("newgamecontainer").classList.contains("hidden") && document.getElementById("yesnocontainer").classList.contains("hidden") && document.getElementById("settingscontainer").classList.contains("hidden") && @@ -6456,7 +6437,6 @@ Current 
version: 124 } function hide_popups() { document.getElementById("saveloadcontainer").classList.add("hidden"); - document.getElementById("loadmodelcontainer").classList.add("hidden"); document.getElementById("newgamecontainer").classList.add("hidden"); document.getElementById("yesnocontainer").classList.add("hidden"); document.getElementById("settingscontainer").classList.add("hidden"); @@ -6525,7 +6505,7 @@ Current version: 124 function explain_horde() { - msgbox("The AI Horde generates text using crowdsourced GPUs by volunteer workers. By default your inputs are not logged, but as Horde workers are open source, they can be modified to do so.

In all cases, the sender will *always be anonymous*, however you are still advised to avoid sending privacy sensitive information.

For any issues, you can find us on discord at https://koboldai.org/discord","Disclaimer",true); + msgbox("The AI Horde generates text using crowdsourced GPUs by volunteer workers. By default your inputs are not logged, but as Horde workers are open source, they can be modified to do so.

In all cases, the sender will *always be anonymous*, however you are still advised to avoid sending privacy sensitive information.
","Disclaimer",true); } function selectImgStyle() @@ -6725,7 +6705,7 @@ Current version: 124 function select_custom_oai_model() { - let isOpenrouter = (document.getElementById("customapidropdown").value==5); + let isOpenrouter = (document.getElementById("customapidropdown").value==3); inputBox("Enter custom model name","Custom Model Name",localsettings.saved_oai_custommodel,"", ()=>{ let coai = getInputBoxValue().trim(); let dropdown = (isOpenrouter?document.getElementById("custom_openrouter_model"):document.getElementById("custom_oai_model")); @@ -6742,7 +6722,7 @@ Current version: 124 } function oai_model_change() { - let isOpenrouter = (document.getElementById("customapidropdown").value==5); + let isOpenrouter = (document.getElementById("customapidropdown").value==3); let dropdown = (isOpenrouter?document.getElementById("custom_openrouter_model"):document.getElementById("custom_oai_model")); let non_completions = (dropdown.value.includes("davinci-002") || dropdown.value.includes("text-davinci-003") || dropdown.value.includes("text-davinci-002") || dropdown.value.includes("text-davinci-001") || dropdown.value.includes("gpt-3.5-turbo-instruct") || dropdown.value == "davinci"); @@ -6790,7 +6770,7 @@ Current version: 124 if (!data.error && data.data && data.data.length > 0) { - let isOpenrouter = (document.getElementById("customapidropdown").value==5); + let isOpenrouter = (document.getElementById("customapidropdown").value==3); let dropdown = (isOpenrouter?document.getElementById("custom_openrouter_model"):document.getElementById("custom_oai_model")); var lastOption = dropdown.lastElementChild; for (var i = dropdown.options.length - 1; i >= 0; i--) { @@ -6848,12 +6828,18 @@ Current version: 124 let epchoice = document.getElementById("customapidropdown").value; document.getElementById("oaicustom").classList.add("hidden"); document.getElementById("koboldcustom").classList.add("hidden"); - document.getElementById("scalecustom").classList.add("hidden"); 
document.getElementById("claudecustom").classList.add("hidden"); document.getElementById("palmcustom").classList.add("hidden"); document.getElementById("custom_oai_model").classList.add("hidden"); document.getElementById("custom_openrouter_model").classList.add("hidden"); + document.getElementById("hordeloadmodelcontainer").classList.add("hidden"); + if(epchoice==0) + { + document.getElementById("hordeloadmodelcontainer").classList.remove("hidden"); + display_horde_models(); + } + else if(epchoice==1) { document.getElementById("koboldcustom").classList.remove("hidden"); if(!localflag) @@ -6862,10 +6848,19 @@ Current version: 124 document.getElementById("customkoboldkey").value = localsettings.saved_kai_key; } } - else if(epchoice==1 || epchoice==5) + else if(epchoice==2 || epchoice==3) { document.getElementById("oaicustom").classList.remove("hidden"); - if(epchoice==5) + if(epchoice==2) + { + document.getElementById("oaidesc").classList.remove("hidden"); + document.getElementById("custom_oai_model").classList.remove("hidden"); + document.getElementById("openrouterdesc").classList.add("hidden"); + document.getElementById("custom_oai_endpoint").classList.remove("hidden"); + document.getElementById("custom_oai_key").value = localsettings.saved_oai_key; + document.getElementById("custom_oai_endpoint").value = (localsettings.saved_oai_addr?localsettings.saved_oai_addr:default_oai_base); + } + else { document.getElementById("oaidesc").classList.add("hidden"); document.getElementById("openrouterdesc").classList.remove("hidden"); @@ -6883,24 +6878,10 @@ Current version: 124 } } } - else - { - document.getElementById("oaidesc").classList.remove("hidden"); - document.getElementById("custom_oai_model").classList.remove("hidden"); - document.getElementById("openrouterdesc").classList.add("hidden"); - document.getElementById("custom_oai_endpoint").classList.remove("hidden"); - document.getElementById("custom_oai_key").value = localsettings.saved_oai_key; - 
document.getElementById("custom_oai_endpoint").value = (localsettings.saved_oai_addr?localsettings.saved_oai_addr:default_oai_base); - - } oai_model_change(); toggleoaichatcompl(); } - else if(epchoice==2) - { - document.getElementById("scalecustom").classList.remove("hidden"); - } - else if(epchoice==3) + else if(epchoice==4) { toggleclaudemodel(); document.getElementById("claudecustom").classList.remove("hidden"); @@ -6909,7 +6890,7 @@ Current version: 124 document.getElementById("claudesystemprompt").value = localsettings.saved_claude_jailbreak; document.getElementById("claudejailbreakprompt").value = localsettings.saved_claude_jailbreak2; } - else if(epchoice==4) + else if(epchoice==5) { document.getElementById("palmcustom").classList.remove("hidden"); document.getElementById("custom_palm_key").value = localsettings.saved_palm_key; @@ -6952,12 +6933,15 @@ Current version: 124 custom_kobold_endpoint = ""; custom_kobold_key = ""; custom_oai_key = ""; - custom_scale_key = ""; custom_claude_key = ""; custom_palm_key = ""; let epchoice = document.getElementById("customapidropdown").value; - if(epchoice==0) //connect to kobold endpoint + if(epchoice==0) //ai horde + { + confirm_horde_models(); + } + else if(epchoice==1) //connect to kobold endpoint { let desiredkoboldendpoint = document.getElementById("customkoboldendpoint").value; let desiredkoboldkey = document.getElementById("customkoboldkey").value; @@ -7203,7 +7187,7 @@ Current version: 124 }); } } - else if(epchoice==1 || epchoice==5) //connect to OAI / OpenRouter Endpoint + else if(epchoice==2 || epchoice==3) //connect to OAI / OpenRouter Endpoint { let desired_oai_key = document.getElementById("custom_oai_key").value.trim(); let desired_oai_ep = document.getElementById("custom_oai_endpoint").value.trim(); @@ -7230,7 +7214,7 @@ Current version: 124 //good to go custom_oai_endpoint = desired_oai_ep; custom_oai_key = desired_oai_key; - if(epchoice==1) + if(epchoice==2) { localsettings.saved_oai_key = 
custom_oai_key; localsettings.saved_oai_addr = custom_oai_endpoint; @@ -7246,7 +7230,7 @@ Current version: 124 } localsettings.saved_oai_role = document.getElementById("oairoledropdown").value; localsettings.saved_oai_jailbreak2 = document.getElementById("jailbreakprompttext2").value; - let isOpenrouter = (document.getElementById("customapidropdown").value==5); + let isOpenrouter = (document.getElementById("customapidropdown").value==3); let dropdown = (isOpenrouter?document.getElementById("custom_openrouter_model"):document.getElementById("custom_oai_model")); custom_oai_model = dropdown.value.trim(); localsettings.saved_oai_custommodel = custom_oai_model; @@ -7268,74 +7252,7 @@ Current version: 124 render_gametext(true); } } - else if(epchoice==2) //connect to Scale Endpoint - { - let desired_scale_key = document.getElementById("custom_scale_key").value.trim(); - let desired_scale_ID = document.getElementById("custom_scale_ID").value.trim(); - - desired_scale_ID = desired_scale_ID.split("#")[0]; - desired_scale_ID = desired_scale_ID.split("?")[0]; - if(desired_scale_ID.includes("dashboard.scale.com/spellbook/api/v2/deploy/") && - desired_scale_key.length == 25 &&!desired_scale_key.includes(" ")&&!desired_scale_key.includes("/")) - { - desired_scale_ID = desired_scale_ID.split("dashboard.scale.com/spellbook/api/v2/deploy/")[1]; - } - else - { - desired_scale_ID = ""; - desired_scale_key = ""; - msgbox("Invalid inputs, please try again."); - } - - if(desired_scale_key!="" && desired_scale_ID!="") - { - hide_popups(); - fetch(cors_proxy+"?"+ scale_submit_endpoint+desired_scale_ID, { - method: 'GET', - headers: { - 'Authorization': 'Bearer '+desired_scale_key, - }, - referrerPolicy: 'no-referrer', - }) - .then((response) => response.json()) - .then((data) => { - console.log(data); - if (data.message && data.message!="") - { - //good to go - custom_scale_key = desired_scale_key; - custom_scale_ID = desired_scale_ID; - selected_models = [{ "performance": 100.0, 
"queued": 0.0, "eta": 0, "name": "SpellbookScaleAI", "count": 1 }]; - selected_workers = []; - if (perfdata == null) { - //generate some fake perf data if horde is offline and using custom endpoint - perfdata = { - "queued_requests": 0, - "queued_tokens": 0, - "past_minute_tokens": 0, - "worker_count": 0 - }; - document.body.classList.add("connected"); - document.getElementById("connectstatus").classList.remove("color_orange"); - document.getElementById("connectstatus").classList.add("color_green"); - } - document.getElementById("connectstatus").innerHTML = "Connected to ScaleAI Endpoint"; - render_gametext(); - } - else - { - custom_scale_key = ""; - msgbox("Cannot connect to Spellbook by ScaleAI"); - } - }) - .catch(error => { - console.log("Error: " + error); - custom_scale_key = ""; - msgbox("Error: " + error); - }); - } - } - else if(epchoice==3) //claude endpoint + else if(epchoice==4) //claude endpoint { let desired_claude_key = document.getElementById("custom_claude_key").value.trim(); let desired_claude_ep = document.getElementById("custom_claude_endpoint").value.trim(); @@ -7387,7 +7304,7 @@ Current version: 124 } } - else if(epchoice==4) //palm endpoint + else if(epchoice==5) //palm endpoint { let desired_palm_key = document.getElementById("custom_palm_key").value.trim(); let mdlname = document.getElementById("custom_palm_model").value; @@ -7421,12 +7338,12 @@ Current version: 124 } - function display_custom_endpoint() + function display_endpoint_container() { document.getElementById("customendpointcontainer").classList.remove("hidden"); customapi_dropdown(); } - function dismiss_custom_endpoint() + function dismiss_endpoint_container() { document.getElementById("customendpointcontainer").classList.add("hidden"); } @@ -7584,9 +7501,8 @@ Current version: 124 } //function to allow selection of models - function display_models() { + function display_horde_models() { document.getElementById("pickedmodel").innerHTML = ""; - 
document.getElementById("loadmodelcontainer").classList.remove("hidden"); document.getElementById("apikey").value = localsettings.my_api_key; document.getElementById("modelquicksearch").value = ""; let manualworker = (document.getElementById("manualworker").checked ? true : false); @@ -7643,7 +7559,6 @@ Current version: 124 } model_choices += ""; } - model_choices += ""; document.getElementById("pickedmodel").innerHTML = model_choices; } } @@ -7689,92 +7604,79 @@ Current version: 124 } } - function confirm_models() { + function confirm_horde_models() { let selected_idx_arr = Array.from(document.getElementById("pickedmodel").selectedOptions).map(({ value }) => value); - if (selected_idx_arr.length == 1 && selected_idx_arr[0] == 9999) //custom endpoint - { - hide_popups(); - display_custom_endpoint(); - } else { - custom_kobold_endpoint = ""; - custom_oai_key = ""; - custom_scale_key = ""; - custom_claude_key = ""; - custom_palm_key = ""; - //remove the Custom Endpoint if it's multi selected together with others. - const findex = selected_idx_arr.indexOf("9999"); - if (findex > -1) { - selected_idx_arr.splice(findex, 1); - } + custom_kobold_endpoint = ""; + custom_oai_key = ""; + custom_claude_key = ""; + custom_palm_key = ""; - if (selected_idx_arr.length > 0) { - let prep_sel_models = []; - let prep_sel_workers = []; //if selected, pick a specific worker ids to use + if (selected_idx_arr.length > 0) { + let prep_sel_models = []; + let prep_sel_workers = []; //if selected, pick a specific worker ids to use - let manualworker = (document.getElementById("manualworker").checked ? true : false); + let manualworker = (document.getElementById("manualworker").checked ? 
true : false); - for (var i = 0; i < selected_idx_arr.length; ++i) { - if (manualworker) //we are looping through selected workers - { - let addedworker = worker_data[selected_idx_arr[i]]; - prep_sel_workers.push(addedworker); - let modnames = addedworker.models; - for (var j = 0; j < modnames.length; ++j) { - let addedmodel = models_data.find(element => (element.name == modnames[j] && element.cluster==addedworker.cluster)); - if (!prep_sel_models.includes(addedmodel)) { - prep_sel_models.push(addedmodel); - } + for (var i = 0; i < selected_idx_arr.length; ++i) { + if (manualworker) //we are looping through selected workers + { + let addedworker = worker_data[selected_idx_arr[i]]; + prep_sel_workers.push(addedworker); + let modnames = addedworker.models; + for (var j = 0; j < modnames.length; ++j) { + let addedmodel = models_data.find(element => (element.name == modnames[j] && element.cluster==addedworker.cluster)); + if (!prep_sel_models.includes(addedmodel)) { + prep_sel_models.push(addedmodel); } } - else //we are looping through selected models - { - let addedmodel = models_data[selected_idx_arr[i]]; - prep_sel_models.push(addedmodel); - } } - - //remove undefined and nulls - prep_sel_models = prep_sel_models.filter(x=>x); - prep_sel_workers = prep_sel_workers.filter(x=>x); - - const allMatched1 = prep_sel_models.every(item => item.cluster === prep_sel_models[0].cluster); - const allMatched2 = prep_sel_workers.every(item => item.cluster === prep_sel_workers[0].cluster); - - if(!allMatched1 || !allMatched2) + else //we are looping through selected models { - if (prep_sel_workers.length > 0) { - let pickedcluster = get_most_common_cluster(prep_sel_workers); - prep_sel_workers = prep_sel_workers.filter(item => item.cluster === pickedcluster); - prep_sel_models = prep_sel_models.filter(item => item.cluster === pickedcluster); - } else { - let pickedcluster = get_most_common_cluster(prep_sel_models); - prep_sel_models = prep_sel_models.filter(item => item.cluster 
=== pickedcluster); - } + let addedmodel = models_data[selected_idx_arr[i]]; + prep_sel_models.push(addedmodel); } + } - selected_models = prep_sel_models; - selected_workers = prep_sel_workers; - localsettings.my_api_key = document.getElementById("apikey").value; - if(localsettings.my_api_key==null || localsettings.my_api_key=="") - { - localsettings.my_api_key = defaultsettings.my_api_key; - } - if (desired_new_home_cluster != null) { - localsettings.home_cluster = desired_new_home_cluster; - desired_new_home_cluster = null; + //remove undefined and nulls + prep_sel_models = prep_sel_models.filter(x=>x); + prep_sel_workers = prep_sel_workers.filter(x=>x); + + const allMatched1 = prep_sel_models.every(item => item.cluster === prep_sel_models[0].cluster); + const allMatched2 = prep_sel_workers.every(item => item.cluster === prep_sel_workers[0].cluster); + + if(!allMatched1 || !allMatched2) + { + if (prep_sel_workers.length > 0) { + let pickedcluster = get_most_common_cluster(prep_sel_workers); + prep_sel_workers = prep_sel_workers.filter(item => item.cluster === pickedcluster); + prep_sel_models = prep_sel_models.filter(item => item.cluster === pickedcluster); + } else { + let pickedcluster = get_most_common_cluster(prep_sel_models); + prep_sel_models = prep_sel_models.filter(item => item.cluster === pickedcluster); } + } - document.getElementById("connectstatus").innerHTML = "Connected to KoboldAI Horde"; + selected_models = prep_sel_models; + selected_workers = prep_sel_workers; + localsettings.my_api_key = document.getElementById("apikey").value; + if(localsettings.my_api_key==null || localsettings.my_api_key=="") + { + localsettings.my_api_key = defaultsettings.my_api_key; + } + if (desired_new_home_cluster != null) { + localsettings.home_cluster = desired_new_home_cluster; + desired_new_home_cluster = null; + } - render_gametext(); - hide_popups(); + document.getElementById("connectstatus").innerHTML = "Connected to AI Horde"; - if(!allMatched1 || 
!allMatched2) - { - msgbox("You've selected multiple workers from different clusters. Only one cluster will be used.","Caution"); - } + render_gametext(); + hide_popups(); + if(!allMatched1 || !allMatched2) + { + msgbox("You've selected multiple workers from different clusters. Only one cluster will be used.","Caution"); } } } @@ -7902,7 +7804,7 @@ Current version: 124 if(uname.toLowerCase()=="anonymous#0") { document.getElementById("kudos_bal").innerHTML = clustertag + uname + "
"+ - "(Register New User)"; + "(Register New User)"; }else{ document.getElementById("showownworkerslink").classList.remove("hidden"); } @@ -7913,12 +7815,12 @@ Current version: 124 } else { - document.getElementById("kudos_bal").innerHTML = "API Key Error
(Register New User)"; + document.getElementById("kudos_bal").innerHTML = "API Key Error
(Register New User)"; } } else { console.log("Error: " + errArr); - document.getElementById("kudos_bal").innerHTML = "API Key Error
(Register New User)"; + document.getElementById("kudos_bal").innerHTML = "API Key Error
(Register New User)"; } }); } @@ -9131,6 +9033,10 @@ Current version: 124 }, true, false, imgres,0.35,true); } + function clear_paste_window() + { + document.getElementById("pasteimgwin").value = ""; + } function img_paste_event(event) { var items = (event.clipboardData || event.originalEvent.clipboardData).items; @@ -10155,46 +10061,10 @@ Current version: 124 msgbox("Error while submitting prompt: " + error); }); } - else if (custom_scale_key != "")//handle for Scale - { - let targetep = cors_proxy + "?" + scale_submit_endpoint + custom_scale_ID; - let scale_payload = { "input": { "input": submit_payload.prompt } }; - - last_request_str = JSON.stringify(scale_payload); - fetch(targetep, { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - 'Authorization': 'Basic ' + custom_scale_key, - }, - body: JSON.stringify(scale_payload), - referrerPolicy: 'no-referrer', - }) - .then((response) => response.json()) - .then((data) => { - console.log("sync finished response: " + JSON.stringify(data)); - if (custom_scale_key != "" && data.output != null && data.output != "") { - synchro_polled_response = data.output; - } - else { - //error occurred, maybe captcha failed - console.error("error occurred in Scale generation"); - clear_poll_flags(); - render_gametext(); - msgbox("Error occurred during text generation: " + formatError(data)); - } - }) - .catch((error) => { - console.error('Error:', error); - clear_poll_flags(); - render_gametext(); - msgbox("Error while submitting prompt: " + error); - }); - } else if (custom_claude_key != "")//handle for Claude { let claudev3mode = custom_claude_model.toLowerCase().includes("claude-3"); - let targetep = cors_proxy + "?" 
+ (custom_claude_endpoint + (claudev3mode?claude_submit_endpoint_v3:claude_submit_endpoint)); + let targetep = apply_proxy_url((custom_claude_endpoint + (claudev3mode?claude_submit_endpoint_v3:claude_submit_endpoint)),true); let claude_payload = null; if(claudev3mode) { @@ -12041,10 +11911,6 @@ Current version: 124 { whorun = "
You're using the OpenAI API"; } - else if(custom_scale_key!="") - { - whorun = "
You're using the Spellbook by Scale AI API"; - } else if(custom_claude_key!="") { whorun = "
You're using the Claude API"; @@ -12281,8 +12147,8 @@ Current version: 124 document.getElementById("fvico").href = favivon_normal; } else if (selected_models.length == 0 && selected_workers.length == 0) { - let perfinfo = "There are " + perfdata.worker_count + " total volunteer(s) in the KoboldAI Horde, and " + perfdata.queued_requests + " request(s) in queues.
A total of " + perfdata.past_minute_tokens + " tokens were generated in the last minute.

"; - document.getElementById("gametext").innerHTML = "Welcome to KoboldAI Lite!

" + perfinfo + "Please select an AI model to use!
"; + let perfinfo = "There are " + perfdata.worker_count + " total volunteer(s) in the AI Horde, and " + perfdata.queued_requests + " request(s) in queues.
A total of " + perfdata.past_minute_tokens + " tokens were generated in the last minute.

"; + document.getElementById("gametext").innerHTML = "Welcome to KoboldAI Lite!

" + perfinfo + "Please select an AI service to use!
"; document.getElementById("fvico").href = favivon_normal; } else if (pending_response_id == "") { @@ -12596,13 +12462,9 @@ Current version: 124 localsettings.prev_custom_endpoint_type = 2; if(custom_oai_endpoint.toLowerCase().includes("openrouter.ai")) { - localsettings.prev_custom_endpoint_type = 6; + localsettings.prev_custom_endpoint_type = 3; } } - else if(custom_scale_key!="") - { - localsettings.prev_custom_endpoint_type = 3; - } else if(custom_claude_key!="") { localsettings.prev_custom_endpoint_type = 4; @@ -13553,11 +13415,11 @@ Current version: 124