diff --git a/common/chat-auto-parser-generator.cpp b/common/chat-auto-parser-generator.cpp index 453559a4b..2fac0ea5c 100644 --- a/common/chat-auto-parser-generator.cpp +++ b/common/chat-auto-parser-generator.cpp @@ -136,10 +136,10 @@ common_peg_parser analyze_reasoning::build_parser(parser_build_context & ctx) co if (!end.empty()) { if (!start.empty()) { // Standard tag-based: optional(reasoning) - return p.optional(start + p.reasoning(p.until(end)) + end + p.space()); + return p.optional(p.optspace(start) + p.reasoning(p.until(trim_whitespace(end))) + p.optspace(end)); } // Delimiter-style (empty start) - return p.optional(p.reasoning(p.until(end)) + end + p.space()); + return p.optional(p.reasoning(p.until(trim_whitespace(end))) + p.optspace(end)); } } @@ -186,7 +186,6 @@ common_peg_parser analyze_tools::build_parser(parser_build_context & ctx) const common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_context & ctx) const { auto & p = ctx.p; const auto & inputs = ctx.inputs; - bool force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED; // Build effective field names with dot notation if function_field is set std::string name_field = format.name_field; @@ -225,8 +224,7 @@ common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_cont tool_start = format.per_call_start; } - return ctx.reasoning_parser + (force_tools ? 
p.eps() : p.optional(p.content(p.until(tool_start)))) + tools_parser + - p.end(); + return ctx.reasoning_parser + p.optional(p.content(p.until(tool_start))) + tools_parser + p.end(); } common_peg_parser analyze_tools::build_func_parser(common_chat_peg_builder & p, const std::string & name, @@ -270,7 +268,6 @@ common_peg_parser analyze_tools::build_func_parser(common_chat_peg_builder & p, common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context & ctx) const { auto & p = ctx.p; const auto & inputs = ctx.inputs; - bool force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED; common_peg_parser tool_choice = p.choice(); @@ -336,14 +333,12 @@ common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context std::string trigger_marker = !format.section_start.empty() ? format.section_start : format.per_call_start; auto content_before_tools = trigger_marker.empty() ? p.eps() : p.until(trigger_marker); - return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(content_before_tools))) + tool_calls + - p.end(); + return ctx.reasoning_parser + p.optional(p.content(content_before_tools)) + tool_calls + p.end(); } common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_context & ctx) const { auto & p = ctx.p; const auto & inputs = ctx.inputs; - bool force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED; auto until_suffix = p.rule("until-suffix", p.until(arguments.value_suffix)); @@ -471,8 +466,7 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte std::string trigger_marker = !format.section_start.empty() ? format.section_start : format.per_call_start; auto content_before_tools = trigger_marker.empty() ? p.eps() : p.until(trigger_marker); - return ctx.reasoning_parser + (force_tools ? 
p.eps() : p.optional(p.content(content_before_tools))) + tool_calls + - p.end(); + return ctx.reasoning_parser + p.optional(p.content(content_before_tools)) + tool_calls + p.end(); } } // namespace autoparser diff --git a/common/chat-diff-analyzer.cpp b/common/chat-diff-analyzer.cpp index 264ace462..9c7c9678a 100644 --- a/common/chat-diff-analyzer.cpp +++ b/common/chat-diff-analyzer.cpp @@ -342,7 +342,7 @@ void analyze_reasoning::compare_thinking_enabled() { if (left_trimmed.empty() && !diff.right.empty()) { if (!right_trimmed.empty() && string_ends_with(comparison->output_B, right_trimmed)) { if (start.empty()) { - start = trim_leading_whitespace(diff.right); + start = diff.right; mode = reasoning_mode::TAG_BASED; } } @@ -353,7 +353,7 @@ void analyze_reasoning::compare_thinking_enabled() { if (seg.size() >= 2 && seg[seg.size() - 1].value == left_trimmed && seg[seg.size() - 2].type == segment_type::MARKER) { start = seg[seg.size() - 2].value; } - end = trim_trailing_whitespace(diff.left); + end = diff.left; mode = reasoning_mode::TAG_BASED; } } @@ -445,14 +445,14 @@ void analyze_reasoning::compare_reasoning_scope() { auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B); if (result.result.success()) { start = result.tags["pre"]; - end = trim_trailing_whitespace(result.tags["post"]); + end = result.tags["post"]; } else { auto parser_delimiter = build_tagged_peg_parser([&](common_peg_parser_builder &p) { return p.literal(reasoning_content) + p.space() + p.optional(p.tag("post", (p.marker() + p.space()))); }); result = parser_delimiter.parse_anywhere_and_extract(comparison->output_B); if (result.result.success()) { - end = trim_trailing_whitespace(result.tags["post"]); + end = result.tags["post"]; } else { LOG_DBG(ANSI_ORANGE "%s: Unable to extract reasoning markers, falling back to reasoning = NONE\n" ANSI_RESET, __func__); mode = reasoning_mode::NONE; diff --git a/common/chat-peg-parser.cpp b/common/chat-peg-parser.cpp index 
56eb567df..a4818859a 100644 --- a/common/chat-peg-parser.cpp +++ b/common/chat-peg-parser.cpp @@ -816,6 +816,32 @@ common_peg_parser common_chat_peg_builder::prefix(const std::string & s, const s return literal(s.substr(0, s.rfind(delimiter))); } +common_peg_parser common_chat_peg_builder::optspace(const std::string & tag) { /* matches tag; every leading/trailing whitespace character of tag is individually optional */ + auto parser = eps(); + size_t end_of_prefix_space = tag.size(); + size_t start_of_suffix_space = tag.size(); + for (size_t i = 0; i < tag.size(); i++) { + if (!std::isspace(static_cast<unsigned char>(tag[i]))) { /* cast: std::isspace on a negative char (e.g. a UTF-8 byte) is undefined behaviour */ + end_of_prefix_space = i; + break; + } + } + for (size_t i = tag.size(); i > 0; i--) { + if (!std::isspace(static_cast<unsigned char>(tag[i - 1]))) { + start_of_suffix_space = i; + break; + } + } + for (size_t i = 0; i < end_of_prefix_space; i++) { + parser += optional(literal(std::string(1, tag[i]))); + } + parser += literal(tag.substr(end_of_prefix_space, start_of_suffix_space - end_of_prefix_space)); + for (size_t i = start_of_suffix_space; i < tag.size(); i++) { + parser += optional(literal(std::string(1, tag[i]))); + } + return parser; +} + common_peg_parser common_chat_peg_builder::standard_json_tools( const std::string & section_start, const std::string & section_end, diff --git a/common/chat-peg-parser.h b/common/chat-peg-parser.h index 1ea3eb7eb..c684d7735 100644 --- a/common/chat-peg-parser.h +++ b/common/chat-peg-parser.h @@ -96,6 +96,9 @@ class common_chat_peg_builder : public common_peg_parser_builder { // Return a parser that parses the prefix of a string, up to a given delimiter. 
common_peg_parser prefix(const std::string & s, const std::string & delimiter = {}); + // Return a parser that parses all elements of tag, but leading and trailing spaces are optional + common_peg_parser optspace(const std::string & tag); + // Legacy-compatible helper for building standard JSON tool calls // Used by tests and manual parsers // name_key/args_key: JSON key names for function name and arguments diff --git a/common/chat.cpp b/common/chat.cpp index ade142c5c..5364a6e01 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -2221,8 +2221,8 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_ auto auto_params = autoparser::peg_generator::generate_parser(tmpl, params, autoparser); auto_params.supports_thinking = autoparser.reasoning.mode != autoparser::reasoning_mode::NONE; if (auto_params.supports_thinking) { - auto_params.thinking_start_tag = autoparser.reasoning.start; - auto_params.thinking_end_tag = autoparser.reasoning.end; + auto_params.thinking_start_tag = trim_whitespace(autoparser.reasoning.start); + auto_params.thinking_end_tag = trim_whitespace(autoparser.reasoning.end); } auto_params.generation_prompt = params.generation_prompt; common_peg_arena arena; diff --git a/common/reasoning-budget.cpp b/common/reasoning-budget.cpp index c6e1f86c9..8c1f72fc2 100644 --- a/common/reasoning-budget.cpp +++ b/common/reasoning-budget.cpp @@ -158,6 +158,8 @@ static void common_reasoning_budget_apply(struct llama_sampler * smpl, llama_tok for (size_t i = 0; i < cur_p->size; i++) { if (cur_p->data[i].id != forced) { cur_p->data[i].logit = -INFINITY; + } else { + cur_p->data[i].logit = +INFINITY; // force the token } } } diff --git a/scripts/server-test-function-call.py b/scripts/server-test-function-call.py index b3aae1a96..c32f17b5e 100755 --- a/scripts/server-test-function-call.py +++ b/scripts/server-test-function-call.py @@ -79,7 +79,7 @@ def print_info(msg): # 
--------------------------------------------------------------------------- -def chat_completion(url, messages, tools=None, stream=False): +def chat_completion(url, messages, tools=None, stream=False, force_tools=False): payload = { "messages": messages, "stream": stream, @@ -87,7 +87,10 @@ def chat_completion(url, messages, tools=None, stream=False): } if tools: payload["tools"] = tools - payload["tool_choice"] = "auto" + if force_tools: + payload["tool_choice"] = "required" + else: + payload["tool_choice"] = "auto" try: response = requests.post(url, json=payload, stream=stream) @@ -160,7 +163,13 @@ def chat_completion(url, messages, tools=None, stream=False): return result -def run_agentic_loop(url, messages, tools, mock_tool_responses, stream, max_turns=6): +def all_tools_called(tools, all_tool_calls): + all_tool_names = {tool["function"]["name"] for tool in tools} + all_called_tool_names = {tc["function"]["name"] for tc in all_tool_calls} + return all_tool_names <= all_called_tool_names  # subset test: a call to a name outside `tools` must not keep this False forever + + +def run_agentic_loop(url, messages, tools, mock_tool_responses, stream, max_turns=6, force_tools=False): """ Drive the multi-turn tool-call loop: 1. Send messages to model. 
@@ -172,8 +181,8 @@ def run_agentic_loop(url, messages, tools, mock_tool_responses, stream, max_turn msgs = list(messages) all_tool_calls: list[dict] = [] - for _ in range(max_turns): - result = chat_completion(url, msgs, tools=tools, stream=stream) + for t in range(max_turns): + result = chat_completion(url, msgs, tools=tools, stream=stream, force_tools=(force_tools and not all_tools_called(tools, all_tool_calls))) if result is None: return all_tool_calls, None @@ -235,10 +244,10 @@ def run_agentic_loop(url, messages, tools, mock_tool_responses, stream, max_turn # --------------------------------------------------------------------------- -def run_test(url, test_case, stream): +def run_test(url, test_case, stream, force_tools): name = test_case["name"] mode = f"{'stream' if stream else 'non-stream'}" - print_header(f"{name} [{mode}]") + print_header(f"{name} [{mode}, force_tools={force_tools}] ") all_tool_calls, final_content = run_agentic_loop( url, @@ -246,6 +255,7 @@ def run_test(url, test_case, stream): tools=test_case["tools"], mock_tool_responses=test_case["mock_tool_responses"], stream=stream, + force_tools=force_tools ) if final_content is None and not all_tool_calls: @@ -1093,6 +1103,9 @@ def main(): parser.add_argument( "--stream-only", action="store_true", help="Only run streaming mode tests" ) + parser.add_argument( + "--force-tools", action="store_true", help="Change tool mode to forced instead of auto" + ) parser.add_argument( "--test", help="Run only the test whose name contains this substring (case-insensitive)", @@ -1103,10 +1116,13 @@ def main(): print_info(f"Testing server at {url}") modes = [] + force_tools = False if not args.stream_only: modes.append(False) if not args.no_stream: modes.append(True) + if args.force_tools: + force_tools = True cases: list[dict] = ALL_TEST_CASES if args.test: @@ -1121,7 +1137,7 @@ def main(): for stream in modes: for case in cases: total += 1 - if run_test(url, case, stream=stream): + if run_test(url, case, 
stream=stream, force_tools=force_tools): passed += 1 color = GREEN if passed == total else RED diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index e6a523664..ea9d87ebe 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -542,6 +542,36 @@ static common_chat_tool edit_tool{ })", }; +static common_chat_tool manage_todo_list_tool{ + /* .name = */ "manage_todo_list", + /* .description = */ "Create or update the todo list", + /* .parameters = */ R"({ + "type": "object", + "properties": { + "todos": { + "type": "array", + "description": "List of TODO list items" + } + }, + "required": ["todos"] + })", +}; + +static common_chat_tool run_in_terminal_tool{ + /* .name = */ "run_in_terminal", + /* .description = */ "Run a shell command.", + /* .parameters = */ R"({ + "type": "object", + "properties": { + "command": { + "type": "string", + "description": "Shell command to run" + } + }, + "required": ["command"] + })", +}; + static common_chat_tool magic_tool{ /* .name = */ "magic", /* .description = */ "Magic tool that takes a hash", @@ -1379,6 +1409,16 @@ class peg_test_builder { return *this; } + peg_test_builder & tool_choice(common_chat_tool_choice choice) { + tc_.params.tool_choice = choice; + return *this; + } + + peg_test_builder & messages(std::vector<common_chat_msg> messages) { /* taken by value and moved into tc_.params.messages */ + tc_.params.messages = std::move(messages); + return *this; + } + // Execute the test void run() { // Check template filter @@ -1755,23 +1795,23 @@ static void test_template_output_peg_parsers(bool detailed_debug) { "hello()\n" "\n" "\n" - "" - ) + "") .enable_thinking(true) .reasoning_format(COMMON_REASONING_FORMAT_AUTO) .tools({ python_tool }) - .expect_reasoning("Let's call a tool: \n" - "\n" - "\n" - "def hello():\n" - " print(\"Not the real call!\")\n" - "\n" - "hello()\n" - "\n" - "\n" - "") + .expect_reasoning( + "Let's call a tool: \n" + "\n" + "\n" + "def hello():\n" + " print(\"Not the real call!\")\n" + "\n" + "hello()\n" + "\n" + "\n" + "") .expect_tool_calls({ { "python", 
"{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}", {} }, }) @@ -1800,6 +1840,219 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .tools({ empty_args_tool_no_properties }) .expect(message_with_tool_calls("empty_args_no_props", "{}")) .run(); + + // Edge cases when reasoning traces are not sent + tst.test( + "\n\n\n\n" + "\n" + "\n" + "\n1\n\n" + "\n" + "") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .tools({ + special_function_tool + }) + .expect_reasoning("\n\n") + .expect_tool_calls({ { "special_function", "{\"arg1\": 1}", "" } }) + .run(); + + tst.test( + "\n\n" + "\n" + "\n" + "\n1\n\n" + "\n" + "") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .tools({ + special_function_tool + }) + .expect_reasoning("") + .expect_tool_calls({ { "special_function", "{\"arg1\": 1}", "" } }) + .run(); + + tst.test( + "\n\n" + "\n" + "\n" + "\n" + "pwd\n" + "\n" + "\n" + "") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .tools({ + run_in_terminal_tool + }) + .expect_tool_calls({ + { "run_in_terminal", R"({"command": "pwd"})", {} }, + }) + .run(); + + tst.test( + "\n\n" + "Let me inspect the current directory.\n" + "\n" + "\n" + "\n" + "pwd\n" + "\n" + "\n" + "") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .tools({ + run_in_terminal_tool + }) + .expect_content("Let me inspect the current directory.\n") + .expect_tool_calls({ + { "run_in_terminal", R"({"command": "pwd"})", {} }, + }) + .run(); + + tst.test( + "\n\n" + "Let me inspect the current directory.\n" + "\n" + "\n" + "\n" + "pwd\n" + "\n" + "\n" + "") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .tools({ + run_in_terminal_tool + }) + .tool_choice(COMMON_CHAT_TOOL_CHOICE_REQUIRED) + .expect_content("Let me inspect the current directory.\n") + .expect_tool_calls({ + { "run_in_terminal", R"({"command": "pwd"})", {} }, + }) + .run(); + + tst.test( + "I should inspect the 
directory.\n" + "\n\n" + "Let me inspect it now.\n" + "\n" + "\n" + "\n" + "pwd\n" + "\n" + "\n" + "") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .tools({ + run_in_terminal_tool + }) + .expect_reasoning("I should inspect the directory.") + .expect_content("Let me inspect it now.\n") + .expect_tool_calls({ + { "run_in_terminal", R"({"command": "pwd"})", {} }, + }) + .run(); + + tst.test( + "I might call later, but I am still thinking.\n" + "\n\n" + "Final answer without tools.") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .tools({ run_in_terminal_tool }) + .expect_reasoning("I might call later, but I am still thinking.") + .expect_content("Final answer without tools.") + .run(); + + { + common_chat_msg user_start; + user_start.role = "user"; + user_start.content = "Create a todo list, then inspect the repository."; + + common_chat_msg assistant_todos = + simple_assist_msg("", "", "manage_todo_list", + R"({"todos":[{"item":"Inspect repository","selected":false}]})", "call_todos"); + + common_chat_msg tool_result; + tool_result.role = "tool"; + tool_result.content = "Successfully wrote todo list"; + tool_result.tool_call_id = "call_todos"; + + common_chat_msg user_continue; + user_continue.role = "user"; + user_continue.content = "Proceed."; + + tst.test( + "I need to run a terminal command.\n" + "\n\n" + "\n" + "\n" + "\n" + "pwd\n" + "\n" + "\n" + "") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .tools({ + manage_todo_list_tool, run_in_terminal_tool + }) + .messages({ user_start, assistant_todos, tool_result, user_continue }) + .expect_reasoning("I need to run a terminal command.") + .expect_tool_calls({ + { "run_in_terminal", R"({"command": "pwd"})", {} }, + }) + .run(); + + tst.test( + "I need to run a terminal command.\n" + "\n\n" + "Let me inspect the current directory.\n" + "\n" + "\n" + "\n" + "pwd\n" + "\n" + "\n" + "") + 
.reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .tools({ + manage_todo_list_tool, run_in_terminal_tool + }) + .tool_choice(COMMON_CHAT_TOOL_CHOICE_REQUIRED) + .messages({ user_start, assistant_todos, tool_result, user_continue }) + .expect_reasoning("I need to run a terminal command.") + .expect_content("Let me inspect the current directory.\n") + .expect_tool_calls({ + { "run_in_terminal", R"({"command": "pwd"})", {} }, + }) + .run(); + + tst.test( + "\n\n" + "\n" + "\n" + "\n" + "pwd\n" + "\n" + "\n" + "") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .enable_thinking(true) + .tools({ + manage_todo_list_tool, run_in_terminal_tool + }) + .messages({ user_start, assistant_todos, tool_result, user_continue }) + .expect_tool_calls({ + { "run_in_terminal", R"({"command": "pwd"})", {} }, + }) + .run(); + } } { diff --git a/tests/test-reasoning-budget.cpp b/tests/test-reasoning-budget.cpp index f7a601789..747d24644 100644 --- a/tests/test-reasoning-budget.cpp +++ b/tests/test-reasoning-budget.cpp @@ -70,20 +70,20 @@ static void test_reasoning_budget( llama_sampler_apply(sampler, &cur_p); // Check if forcing is active (all logits except one should be -INFINITY) - size_t finite_count = 0; - llama_token finite_token = -1; + size_t not_neg_inf = 0; + llama_token not_neg_inf_token = -1; for (size_t j = 0; j < cur.size(); j++) { - if (std::isfinite(cur[j].logit)) { - finite_count++; - finite_token = cur[j].id; + if (std::isfinite(cur[j].logit) || cur[j].logit > 0) { // +INFINITY + not_neg_inf++; + not_neg_inf_token = cur[j].id; } } llama_sampler_accept(sampler, sequence[i]); - fprintf(stderr, " i=%zu: token=%d, finite_count=%zu, finite_token=%d\n", i, (int)sequence[i], finite_count, (int)finite_token); + fprintf(stderr, " i=%zu: token=%d, not_neg_inf_count=%zu, not_neg_inf_token=%d\n", i, (int)sequence[i], not_neg_inf, (int)not_neg_inf_token); - if (finite_count == 1) { + if (not_neg_inf == 1) { if (actual_force_start == SIZE_MAX) { 
actual_force_start = i; } diff --git a/tools/server/tests/unit/test_tool_call.py b/tools/server/tests/unit/test_tool_call.py index b1a5ab9da..9fa84d165 100755 --- a/tools/server/tests/unit/test_tool_call.py +++ b/tools/server/tests/unit/test_tool_call.py @@ -126,69 +126,70 @@ def do_test_completion_with_required_tool_tiny(server: ServerProcess, tool: dict actual_arguments = json.loads(actual_arguments) assert argument_key in actual_arguments, f"tool arguments: {actual_arguments}, expected: {argument_key}" +# PR #22654: commented out since we're now allowing content before tool calls in tool_call: required, so we can't force this +# in the tiny model just by using the grammar +# +# @pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED]) +# @pytest.mark.parametrize("template_name,tool,argument_key", [ +# ("Qwen3-Coder", TEST_TOOL, "success"), +# ("Qwen3-Coder", TEST_TOOL, "success"), +# ("meta-llama-Llama-3.3-70B-Instruct", TEST_TOOL, "success"), +# ("meta-llama-Llama-3.3-70B-Instruct", TEST_TOOL, "success"), +# ("meta-llama-Llama-3.3-70B-Instruct", PYTHON_TOOL, "code"), +# ("meta-llama-Llama-3.3-70B-Instruct", PYTHON_TOOL, "code"), +# ]) +# def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict, argument_key: str | None, stream: CompletionMode): +# global server +# n_predict = 1024 +# # server = ServerPreset.stories15m_moe() +# server.jinja = True +# server.n_predict = n_predict +# server.chat_template_file = f'../../../models/templates/{template_name}.jinja' +# server.start() +# do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, stream=stream == CompletionMode.STREAMED, temperature=0.0, top_k=1, top_p=1.0) -@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED]) -@pytest.mark.parametrize("template_name,tool,argument_key", [ - ("Qwen3-Coder", TEST_TOOL, "success"), - ("Qwen3-Coder", TEST_TOOL, "success"), - ("meta-llama-Llama-3.3-70B-Instruct", 
TEST_TOOL, "success"), - ("meta-llama-Llama-3.3-70B-Instruct", TEST_TOOL, "success"), - ("meta-llama-Llama-3.3-70B-Instruct", PYTHON_TOOL, "code"), - ("meta-llama-Llama-3.3-70B-Instruct", PYTHON_TOOL, "code"), -]) -def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict, argument_key: str | None, stream: CompletionMode): - global server - n_predict = 1024 - # server = ServerPreset.stories15m_moe() - server.jinja = True - server.n_predict = n_predict - server.chat_template_file = f'../../../models/templates/{template_name}.jinja' - server.start() - do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, stream=stream == CompletionMode.STREAMED, temperature=0.0, top_k=1, top_p=1.0) +# @pytest.mark.slow +# @pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED]) +# @pytest.mark.parametrize("template_name,tool,argument_key", [ +# ("meta-llama-Llama-3.1-8B-Instruct", TEST_TOOL, "success"), +# ("meta-llama-Llama-3.1-8B-Instruct", PYTHON_TOOL, "code"), +# ("meetkai-functionary-medium-v3.1", TEST_TOOL, "success"), +# ("meetkai-functionary-medium-v3.1", PYTHON_TOOL, "code"), -@pytest.mark.slow -@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED]) -@pytest.mark.parametrize("template_name,tool,argument_key", [ - ("meta-llama-Llama-3.1-8B-Instruct", TEST_TOOL, "success"), - ("meta-llama-Llama-3.1-8B-Instruct", PYTHON_TOOL, "code"), +# ("meetkai-functionary-medium-v3.2", TEST_TOOL, "success"), +# # Functionary v3.2 format supports raw python content, which w/ a dummy stories model will never end on its own. 
+# # ("meetkai-functionary-medium-v3.2", PYTHON_TOOL, "code"), - ("meetkai-functionary-medium-v3.1", TEST_TOOL, "success"), - ("meetkai-functionary-medium-v3.1", PYTHON_TOOL, "code"), +# ("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", TEST_TOOL, "success"), +# ("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", PYTHON_TOOL, "code"), - ("meetkai-functionary-medium-v3.2", TEST_TOOL, "success"), - # Functionary v3.2 format supports raw python content, which w/ a dummy stories model will never end on its own. - # ("meetkai-functionary-medium-v3.2", PYTHON_TOOL, "code"), +# ("meta-llama-Llama-3.2-3B-Instruct", TEST_TOOL, "success"), +# ("meta-llama-Llama-3.2-3B-Instruct", PYTHON_TOOL, "code"), - ("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", TEST_TOOL, "success"), - ("NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use", PYTHON_TOOL, "code"), +# ("mistralai-Mistral-Nemo-Instruct-2407", TEST_TOOL, "success"), +# ("mistralai-Mistral-Nemo-Instruct-2407", PYTHON_TOOL, "code"), - ("meta-llama-Llama-3.2-3B-Instruct", TEST_TOOL, "success"), - ("meta-llama-Llama-3.2-3B-Instruct", PYTHON_TOOL, "code"), +# ("NousResearch-Hermes-3-Llama-3.1-8B-tool_use", TEST_TOOL, "success"), +# ("NousResearch-Hermes-3-Llama-3.1-8B-tool_use", PYTHON_TOOL, "code"), - ("mistralai-Mistral-Nemo-Instruct-2407", TEST_TOOL, "success"), - ("mistralai-Mistral-Nemo-Instruct-2407", PYTHON_TOOL, "code"), +# ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", TEST_TOOL, "success"), +# ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", PYTHON_TOOL, "code"), - ("NousResearch-Hermes-3-Llama-3.1-8B-tool_use", TEST_TOOL, "success"), - ("NousResearch-Hermes-3-Llama-3.1-8B-tool_use", PYTHON_TOOL, "code"), +# ("fireworks-ai-llama-3-firefunction-v2", TEST_TOOL, "success"), +# # ("fireworks-ai-llama-3-firefunction-v2", PYTHON_TOOL, "codeFalse), True), +# # ("fireworks-ai-llama-3-firefunction-v2", PYTHON_TOOL, "code"), - ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", TEST_TOOL, "success"), - ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", 
PYTHON_TOOL, "code"), - - ("fireworks-ai-llama-3-firefunction-v2", TEST_TOOL, "success"), - # ("fireworks-ai-llama-3-firefunction-v2", PYTHON_TOOL, "codeFalse), True), - # ("fireworks-ai-llama-3-firefunction-v2", PYTHON_TOOL, "code"), - -]) -def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, argument_key: str | None, stream: CompletionMode): - global server - n_predict = 512 - # server = ServerPreset.stories15m_moe() - server.jinja = True - server.n_predict = n_predict - server.chat_template_file = f'../../../models/templates/{template_name}.jinja' - server.start(timeout_seconds=TIMEOUT_START_SLOW) - do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, stream=stream == CompletionMode.STREAMED) +# ]) +# def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, argument_key: str | None, stream: CompletionMode): +# global server +# n_predict = 512 +# # server = ServerPreset.stories15m_moe() +# server.jinja = True +# server.n_predict = n_predict +# server.chat_template_file = f'../../../models/templates/{template_name}.jinja' +# server.start(timeout_seconds=TIMEOUT_START_SLOW) +# do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, stream=stream == CompletionMode.STREAMED) @pytest.mark.slow