fix(llm_assist): disable thinking mode for helper model JSON output (#4358)

* fix(llm_assist): disable thinking mode for helper model JSON output

Pass enable_thinking=False to generate_chat_completion() in both
_run_with_helper() and _generate_with_backend() so the Qwen3.5-4B
helper model produces clean JSON instead of wrapping responses in
<think> tags.
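
For reference, the call shape after this change is roughly the sketch
below. run_helper is a hypothetical wrapper name; the parameter names,
the enable_thinking=False override, and the streaming loop come from
the diff, while `backend` is assumed to be an already-loaded backend
exposing generate_chat_completion():

def run_helper(backend, prompt: str, max_tokens: int = 256) -> str:
    cumulative = ""
    for text in backend.generate_chat_completion(
        messages=[{"role": "user", "content": prompt}],
        top_k=20,
        max_tokens=max_tokens,
        repetition_penalty=1.0,
        enable_thinking=False,  # per-request override of the server default
    ):
        cumulative = text  # stream yields cumulative text; last value is the full output
    return cumulative.strip()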

* fix(llm_assist): log per-request enable_thinking=False override

Add info-level log lines so the user can see that each helper/advisor
request overrides the server-level thinking default to False.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Author: Roland Tannous <2026-03-17 15:58:08 +04:00> (committed by GitHub)
Commit: c6bd55ec61, parent: a0aba96ebd

@@ -143,6 +143,9 @@ def _run_with_helper(prompt: str, max_tokens: int = 256) -> Optional[str]:
         return None
     messages = [{"role": "user", "content": prompt}]
+    logger.info(
+        "Helper model request: enable_thinking=False (per-request override)"
+    )
     cumulative = ""
     for text in backend.generate_chat_completion(
         messages=messages,
@@ -151,6 +154,7 @@ def _run_with_helper(prompt: str, max_tokens: int = 256) -> Optional[str]:
         top_k=20,
         max_tokens=max_tokens,
         repetition_penalty=1.0,
+        enable_thinking=False,
     ):
         cumulative = text  # cumulative — last value is full text
@@ -416,6 +420,7 @@ def _parse_json_response(text: str) -> Optional[dict]:
 def _generate_with_backend(backend, messages: list[dict], max_tokens: int = 512) -> str:
     """Run one chat completion on an already-loaded backend. Returns raw text."""
+    logger.info("Advisor request: enable_thinking=False (per-request override)")
     cumulative = ""
     for text in backend.generate_chat_completion(
         messages=messages,
@@ -424,6 +429,7 @@ def _generate_with_backend(backend, messages: list[dict], max_tokens: int = 512)
         top_k=20,
         max_tokens=max_tokens,
         repetition_penalty=1.0,
+        enable_thinking=False,
     ):
         cumulative = text
     result = cumulative.strip()
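
As a defensive complement (not part of this commit), a parser like the
_parse_json_response referenced above could strip any residual <think>
blocks before decoding. A minimal sketch; both function bodies are
hypothetical and assume nothing about the project's actual
implementation:

import json
import re
from typing import Optional

def strip_think_blocks(text: str) -> str:
    # Hypothetical helper: drop any <think>...</think> residue the model
    # might still emit despite enable_thinking=False.
    return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

def parse_json_response(text: str) -> Optional[dict]:
    # Sketch only; the real _parse_json_response in this file may differ.
    try:
        data = json.loads(strip_think_blocks(text))
    except json.JSONDecodeError:
        return None
    return data if isinstance(data, dict) else None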