Mirror of https://github.com/unslothai/unsloth.git (synced 2026-04-28 03:19:57 +00:00)
fix(llm_assist): disable thinking mode for helper model JSON output (#4358)
* fix(llm_assist): disable thinking mode for helper model JSON output

  Pass enable_thinking=False to generate_chat_completion() in both
  _run_with_helper() and _generate_with_backend() so the Qwen3.5-4B helper
  model produces clean JSON instead of wrapping responses in <think> tags.

* fix(llm_assist): log per-request enable_thinking=False override

  Add info-level log lines so the user can see that each helper/advisor
  request overrides the server-level thinking default to False.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

  For more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
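As background on why the flag matters, here is a minimal sketch, assuming a Qwen3-style chat template where thinking mode prepends a <think>...</think> block to the response: that block defeats a direct json.loads() on the output, so a JSON consumer must either disable thinking per request (as this commit does) or strip the block defensively. The parse_helper_json() helper and the sample output below are hypothetical; only the <think> tag behavior and the enable_thinking flag come from the commit.

```python
import json
import re
from typing import Optional

# Hypothetical safety net; with enable_thinking=False passed per request
# (as in this commit) the model should emit bare JSON and the regex is a no-op.
THINK_RE = re.compile(r"<think>.*?</think>", re.DOTALL)

def parse_helper_json(raw: str) -> Optional[dict]:
    """Strip any <think> block, then attempt to parse the remainder as JSON."""
    cleaned = THINK_RE.sub("", raw).strip()
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        return None

# With thinking enabled, output like this breaks a naive json.loads():
raw = '<think>User wants JSON...</think>\n{"action": "rename", "target": "foo"}'
assert parse_helper_json(raw) == {"action": "rename", "target": "foo"}
```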
This commit is contained in:

Parent: a0aba96ebd
Commit: c6bd55ec61

1 changed file with 6 additions and 0 deletions
@@ -143,6 +143,9 @@ def _run_with_helper(prompt: str, max_tokens: int = 256) -> Optional[str]:
         return None
 
     messages = [{"role": "user", "content": prompt}]
+    logger.info(
+        "Helper model request: enable_thinking=False (per-request override)"
+    )
     cumulative = ""
     for text in backend.generate_chat_completion(
         messages=messages,
@@ -151,6 +154,7 @@ def _run_with_helper(prompt: str, max_tokens: int = 256) -> Optional[str]:
         top_k=20,
         max_tokens=max_tokens,
         repetition_penalty=1.0,
+        enable_thinking=False,
     ):
         cumulative = text  # stream is cumulative; the last value is the full text
 
@@ -416,6 +420,7 @@ def _parse_json_response(text: str) -> Optional[dict]:
 
 def _generate_with_backend(backend, messages: list[dict], max_tokens: int = 512) -> str:
     """Run one chat completion on an already-loaded backend. Returns raw text."""
+    logger.info("Advisor request: enable_thinking=False (per-request override)")
     cumulative = ""
     for text in backend.generate_chat_completion(
         messages=messages,
@@ -424,6 +429,7 @@ def _generate_with_backend(backend, messages: list[dict], max_tokens: int = 512)
         top_k=20,
         max_tokens=max_tokens,
         repetition_penalty=1.0,
+        enable_thinking=False,
     ):
         cumulative = text
     result = cumulative.strip()
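The loops in both hunks rely on a streaming contract that the inline comment only hints at: generate_chat_completion() yields the cumulative text so far rather than per-token deltas, so overwriting `cumulative` on each iteration leaves the full response after the loop. Below is a minimal sketch of that pattern under that assumed contract; the FakeBackend class is invented for illustration, and only the parameter names match the diff.

```python
from typing import Iterator

class FakeBackend:
    """Hypothetical stand-in for the already-loaded backend in the diff."""

    def generate_chat_completion(
        self,
        messages: list[dict],
        max_tokens: int = 512,
        top_k: int = 20,
        repetition_penalty: float = 1.0,
        enable_thinking: bool = False,
    ) -> Iterator[str]:
        # Yield the *cumulative* text so far, not individual deltas;
        # this is why the caller keeps only the last yielded value.
        yield from ['{"ok"', '{"ok": true', '{"ok": true}']

backend = FakeBackend()
cumulative = ""
for text in backend.generate_chat_completion(
    messages=[{"role": "user", "content": "Return a JSON object."}],
    enable_thinking=False,
):
    cumulative = text  # last yielded value is the full response
result = cumulative.strip()
assert result == '{"ok": true}'
```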