Fix tokenizer save gemma (#5115)
* [WIP] Fast inference for qwen3.5
* fix tokenizer not saving properly
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* extend to VLM and cleanup
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* gate tokenizer.model saving
* fix for gated/private models
* Fix tokenizer save review findings
- save.py:261 restore dict-based _TOKENIZER_MODEL_CACHE so negative
results are cached; the set() in 0129fb5e regressed non-SentencePiece
tokenizer saves to a fresh HfApi.model_info call on every checkpoint.
Don't cache on exception so gated/private repos can retry later with a
valid token.
- save.py:282 guard `repo_info.siblings` with `or []`; huggingface_hub
types this Optional and returns None for empty or new repos, which
made any() raise TypeError out of save_pretrained.
- save.py:3487 split push_to_hub into local save + _preserve + push so
uploaded tokenizer_config.json/tokenizer.model include the fix rather
than the unfixed copies written before the upload.
- save.py:3352 call patch_saving_functions on tokenizers passed to
unsloth_save_pretrained_torchao to match the other three save
entrypoints; previously torchao saves skipped the preservation patch.
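A condensed sketch of the cache behavior described above (it mirrors `_has_tokenizer_model` in the diff below; `HfApi.model_info` and `siblings` are the real huggingface_hub API, while the helper name and trimmed signature are illustrative):

    from huggingface_hub import HfApi

    _TOKENIZER_MODEL_CACHE = {}  # dict, not set(): must also remember False results

    def _repo_has_tokenizer_model(repo_id, token = None):
        if repo_id in _TOKENIZER_MODEL_CACHE:
            return _TOKENIZER_MODEL_CACHE[repo_id]  # hit for positives and negatives
        try:
            info = HfApi(token = token).model_info(repo_id, files_metadata = False)
        except Exception:
            return False  # deliberately uncached: gated/private repos can retry with a token
        # siblings is Optional; `or []` keeps any() from raising TypeError on None
        found = any(s.rfilename == "tokenizer.model" for s in (info.siblings or []))
        _TOKENIZER_MODEL_CACHE[repo_id] = found
        return found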
* Fix push_to_hub repo_id conflict and torchao token forwarding
- save.py:3493-3496 pop `repo_id` from kwargs (defaulting to
`save_directory`) before calling `self.push_to_hub(repo_id, **kwargs)`.
The previous `self.push_to_hub(save_directory, **kwargs)` passed
`save_directory` as the first positional `repo_id` while also
forwarding a user-supplied `repo_id` through kwargs, raising
`TypeError: got multiple values for argument 'repo_id'` on the
standard `save_pretrained(local_path, push_to_hub=True, repo_id=...)`
call shape. This regression was introduced by the earlier iteration
that split push_to_hub into an explicit second step.
- save.py:3314 forward `token=token` on the torchao non-PEFT
`tokenizer.save_pretrained(torchao_save_directory)` call so the
patched wrapper can reach gated repos when HF_TOKEN is not in the
environment. Left the sibling `unsloth_generic_save` call at 3063
untouched (blame points at an earlier full-finetuned
save_pretrained_merged fix and the token gap there is lower risk).
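The call-shape fix in isolation, as a hypothetical standalone wrapper (the real version is `unsloth_tokenizer_save_pretrained` in the diff below):

    def save_pretrained_then_push(self, save_directory, push_to_hub = False, **kwargs):
        result = self.original_save_pretrained(save_directory, push_to_hub = False, **kwargs)
        if push_to_hub:
            push_kwargs = dict(kwargs)
            # Pop repo_id so it is never passed both positionally and via **kwargs,
            # which raised TypeError: got multiple values for argument 'repo_id'.
            repo_id = push_kwargs.pop("repo_id", save_directory)
            self.push_to_hub(repo_id, **push_kwargs)
        return result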
* Fix torchao tokenizer reload and push_to_hub repo_id default
- save.py:3283 re-run `patch_saving_functions(tokenizer)` on the tokenizer
freshly loaded by `auto_processor.from_pretrained(save_directory)`. The
rebind at 3283 was overwriting the patched tokenizer
passed into `unsloth_save_pretrained_torchao`, so the subsequent
`tokenizer.push_to_hub` (3309) and `tokenizer.save_pretrained`
(3314) bypassed `_preserve_sentencepiece_tokenizer_assets` and left
`{save_directory}-torchao` without `tokenizer.model` / restored
`added_tokens_decoder`.
- save.py:3497 fall back to `os.path.basename(save_directory)` for
`repo_id` instead of the raw `save_directory`. The round-2 fallback
diverged from `transformers.PreTrainedTokenizerBase.save_pretrained`,
which defaults `repo_id = save_directory.split(os.path.sep)[-1]`;
nested local paths like `./out/my-repo` now resolve to `my-repo`
(the Hub id) instead of the full filesystem path.
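The rebind fix itself is two lines after the reload, excerpted from the save.py:3283 hunk below:

    tokenizer = auto_processor.from_pretrained(save_directory)  # rebinds `tokenizer`,
    # discarding the patched object passed in, so re-apply the save patch:
    if isinstance(tokenizer, (PreTrainedTokenizerBase, ProcessorMixin)):
        tokenizer = patch_saving_functions(tokenizer)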
* Revert tokenizer save_pretrained repo_id basename fallback
- save.py:3497 default `repo_id` back to `save_directory` as-is rather
than `os.path.basename(save_directory)`. The basename fallback (added
last iteration to match upstream transformers) stripped the user
namespace from the Unsloth convention `tokenizer.save_pretrained(
"user/repo", push_to_hub=True)`, redirecting the upload to
`{current_user}/repo`. save.py itself treats `save_directory` as the
repo id at 572, 593, 1723, 1779, 1836, 1844, 1858, and 3025, so the
wrapper should follow the same convention. Users who pass a nested
filesystem path with `push_to_hub=True` can supply explicit
`repo_id=...`.
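The namespace loss is plain `os.path` behavior, nothing Unsloth-specific:

    import os

    assert os.path.basename("user/repo") == "repo"          # Hub namespace stripped
    assert os.path.basename("./out/my-repo") == "my-repo"   # the nested-path case the fallback targeted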
* Guard processor.tokenizer recursion against None
save.py:3511 change `elif hasattr(model, "tokenizer")` to
`elif getattr(model, "tokenizer", None) is not None`. The previous
guard only checked attribute existence; a ProcessorMixin that sets
`tokenizer = None` (audio-only or manually constructed) would enter
the branch and crash inside the recursive patch_saving_functions on
`model.push_to_hub.__name__`.
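The difference between the two guards, on a processor shaped like the failing case (`_AudioOnlyProcessor` is an illustrative stand-in):

    class _AudioOnlyProcessor:  # e.g. a ProcessorMixin without a text tokenizer
        tokenizer = None

    proc = _AudioOnlyProcessor()
    assert hasattr(proc, "tokenizer")                # old guard: True, so it recursed into None and crashed
    assert getattr(proc, "tokenizer", None) is None  # new guard: skips the recursion entirely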
* Add review tests for tokenizer save
* Consolidate review tests
Drop redundant assertion in test_patch_saving_functions_still_patches_non_none_tokenizer.
The hasattr check already proves the patch applied; the or-chained
repeat assertion added no signal.
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---------
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Daniel Han <danielhanchen@gmail.com>
This commit is contained in: parent 3011535871, commit 77756faa46
2 changed files with 233 additions and 18 deletions
tests/saving/test_patch_saving_none_tokenizer.py (new file, 47 additions)
@@ -0,0 +1,47 @@
+from unittest.mock import MagicMock
+
+from transformers import PreTrainedTokenizerBase
+
+from unsloth.save import patch_saving_functions
+
+
+class _ProcessorWithNoneTokenizer:
+    tokenizer = None
+
+    def push_to_hub(self, *args, **kwargs):
+        return None
+
+    push_to_hub.__doc__ = "stub"
+
+    def save_pretrained(self, *args, **kwargs):
+        return None
+
+
+def test_patch_saving_functions_no_crash_on_none_tokenizer():
+    proc = _ProcessorWithNoneTokenizer()
+    patch_saving_functions(proc)
+
+
+def test_patch_saving_functions_still_patches_non_none_tokenizer():
+    inner = MagicMock(spec = PreTrainedTokenizerBase)
+    inner.save_pretrained = MagicMock()
+    inner.save_pretrained.__name__ = "save_pretrained"
+    inner.push_to_hub = MagicMock()
+    inner.push_to_hub.__name__ = "push_to_hub"
+    inner.push_to_hub.__doc__ = "tokenizer doc"
+
+    class _Proc:
+        def __init__(self, tok):
+            self.tokenizer = tok
+
+        def push_to_hub(self, *args, **kwargs):
+            return None
+
+        push_to_hub.__doc__ = "proc doc"
+
+        def save_pretrained(self, *args, **kwargs):
+            return None
+
+    proc = _Proc(inner)
+    patch_saving_functions(proc)
+    assert hasattr(inner, "original_save_pretrained")
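Assuming a standard pytest setup, the new file runs standalone:

    import pytest

    # Equivalent to `pytest -q tests/saving/test_patch_saving_none_tokenizer.py`
    raise SystemExit(pytest.main(["-q", "tests/saving/test_patch_saving_none_tokenizer.py"]))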
unsloth/save.py (204 changed lines: 186 additions, 18 deletions)
@@ -41,6 +41,7 @@ import sys
 import requests
 import torch
 import os
 import json
 import shutil
 import pickle
 import gc
@@ -50,11 +51,10 @@ import subprocess
 import psutil
 import re
 from transformers.models.llama.modeling_llama import logger
 from .tokenizer_utils import fix_sentencepiece_gguf
 from .models.loader_utils import get_model_name
 from .models._utils import _convert_torchao_model
 from .ollama_template_mappers import OLLAMA_TEMPLATES, MODEL_TO_OLLAMA_TEMPLATE_MAPPER
-from transformers import ProcessorMixin
+from transformers import ProcessorMixin, PreTrainedTokenizerBase
 from huggingface_hub import HfApi
 
 try:
@@ -258,6 +258,105 @@ def check_if_sentencepiece_model(
     return sentencepiece_model
 
 
+_TOKENIZER_MODEL_CACHE = {}
+
+
+def _has_tokenizer_model(tokenizer, token = None):
+    tokenizer = tokenizer.tokenizer if hasattr(tokenizer, "tokenizer") else tokenizer
+    if tokenizer is None:
+        return False
+
+    source = getattr(tokenizer, "name_or_path", None)
+    if not isinstance(source, str) or not source:
+        return False
+    if os.path.isdir(source):
+        return os.path.isfile(os.path.join(source, "tokenizer.model"))
+    if source in _TOKENIZER_MODEL_CACHE:
+        return _TOKENIZER_MODEL_CACHE[source]
+
+    try:
+        repo_info = HfApi(token = token).model_info(source, files_metadata = False)
+    except Exception:
+        return False
+
+    has_tokenizer_model = any(
+        sibling.rfilename == "tokenizer.model" for sibling in (repo_info.siblings or [])
+    )
+    _TOKENIZER_MODEL_CACHE[source] = has_tokenizer_model
+    return has_tokenizer_model
+
+
+def _preserve_sentencepiece_tokenizer_assets(
+    tokenizer,
+    save_directory,
+    token = None,
+):
+    tokenizer = tokenizer.tokenizer if hasattr(tokenizer, "tokenizer") else tokenizer
+    if tokenizer is None or not os.path.isdir(save_directory):
+        return
+
+    tokenizer_config_path = os.path.join(save_directory, "tokenizer_config.json")
+    if os.path.isfile(tokenizer_config_path):
+        desired_added_tokens_decoder = {}
+        for token_id, added_token in getattr(
+            tokenizer, "added_tokens_decoder", {}
+        ).items():
+            desired_added_tokens_decoder[str(token_id)] = {
+                "content": getattr(added_token, "content", str(added_token)),
+                "single_word": getattr(added_token, "single_word", False),
+                "lstrip": getattr(added_token, "lstrip", False),
+                "rstrip": getattr(added_token, "rstrip", False),
+                "normalized": getattr(added_token, "normalized", True),
+                "special": getattr(added_token, "special", False),
+            }
+        if desired_added_tokens_decoder:
+            with open(tokenizer_config_path, "r", encoding = "utf-8") as file:
+                tokenizer_config = json.load(file)
+            if (
+                tokenizer_config.get("added_tokens_decoder")
+                != desired_added_tokens_decoder
+            ):
+                tokenizer_config["added_tokens_decoder"] = desired_added_tokens_decoder
+                with open(tokenizer_config_path, "w", encoding = "utf-8") as file:
+                    json.dump(tokenizer_config, file, indent = 2, ensure_ascii = False)
+                    file.write("\n")
+                logger.warning_once(
+                    f"Unsloth: Restored added_tokens_decoder metadata in "
+                    f"{tokenizer_config_path}."
+                )
+
+    tokenizer_model = os.path.join(save_directory, "tokenizer.model")
+    downloaded_path = None
+    if not os.path.isfile(tokenizer_model) and _has_tokenizer_model(
+        tokenizer,
+        token = token,
+    ):
+        source = getattr(tokenizer, "name_or_path", None)
+        if isinstance(source, str) and source:
+            if os.path.isdir(source):
+                local_path = os.path.join(source, "tokenizer.model")
+                if os.path.isfile(local_path):
+                    downloaded_path = local_path
+            else:
+                from huggingface_hub import hf_hub_download
+
+                try:
+                    downloaded_path = hf_hub_download(
+                        repo_id = source,
+                        filename = "tokenizer.model",
+                        token = token,
+                    )
+                except Exception:
+                    downloaded_path = None
+
+    if not os.path.isfile(tokenizer_model) and downloaded_path is not None:
+        shutil.copy2(downloaded_path, tokenizer_model)
+        logger.warning_once(
+            f"Unsloth: Preserved sentencepiece asset `tokenizer.model` in "
+            f"{save_directory}."
+        )
+
+
 def _free_cached_model(model):
     from huggingface_hub import scan_cache_dir
 
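For reference, each `added_tokens_decoder` entry restored by `_preserve_sentencepiece_tokenizer_assets` above has this shape (the token id and content are made up):

    restored_entry = {
        "2": {
            "content": "</s>",
            "single_word": False,
            "lstrip": False,
            "rstrip": False,
            "normalized": False,
            "special": True,
        },
    }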
@@ -353,6 +452,9 @@ def unsloth_save_model(
     maximum_memory_usage: float = 0.9,
     datasets: Optional[List[str]] = None,
 ):
+    if isinstance(tokenizer, (PreTrainedTokenizerBase, ProcessorMixin)):
+        tokenizer = patch_saving_functions(tokenizer)
+
     if token is None:
         token = get_token()
@@ -480,8 +582,11 @@
         )
         if tokenizer is not None:
             # Set padding side to left for inference
-            old_padding_side = tokenizer.padding_side
-            tokenizer.padding_side = "left"
+            _tokenizer = (
+                tokenizer.tokenizer if hasattr(tokenizer, "tokenizer") else tokenizer
+            )
+            old_padding_side = _tokenizer.padding_side
+            _tokenizer.padding_side = "left"
 
             getattr(tokenizer, "original_push_to_hub", tokenizer.push_to_hub)(
                 repo_id = save_directory,
@@ -498,7 +603,7 @@
             )
 
             # Revert back padding side
-            tokenizer.padding_side = old_padding_side
+            _tokenizer.padding_side = old_padding_side
 
     if hasattr(model, "config"):
         print(
@@ -579,13 +684,16 @@
         print("Unsloth: Saving tokenizer...", end = "")
 
         # Set padding side to left for inference
-        old_padding_side = tokenizer.padding_side
-        tokenizer.padding_side = "left"
+        _tokenizer = (
+            tokenizer.tokenizer if hasattr(tokenizer, "tokenizer") else tokenizer
+        )
+        old_padding_side = _tokenizer.padding_side
+        _tokenizer.padding_side = "left"
 
         tokenizer.save_pretrained(**tokenizer_save_settings)
 
         # Revert back padding side
-        tokenizer.padding_side = old_padding_side
+        _tokenizer.padding_side = old_padding_side
 
         print(" Done.")
     else:
@@ -865,13 +973,16 @@
         print("Unsloth: Saving tokenizer...", end = "")
 
         # Set padding side to left for inference
-        old_padding_side = tokenizer.padding_side
-        tokenizer.padding_side = "left"
+        _tokenizer = (
+            tokenizer.tokenizer if hasattr(tokenizer, "tokenizer") else tokenizer
+        )
+        old_padding_side = _tokenizer.padding_side
+        _tokenizer.padding_side = "left"
 
         tokenizer.save_pretrained(**tokenizer_save_settings)
 
         # Revert back padding side
-        tokenizer.padding_side = old_padding_side
+        _tokenizer.padding_side = old_padding_side
 
         print(" Done.")
     else:
@@ -1993,6 +2104,8 @@ def unsloth_save_pretrained_gguf(
     """
     if tokenizer is None:
         raise ValueError("Unsloth: Saving to GGUF must have a tokenizer.")
+    if isinstance(tokenizer, (PreTrainedTokenizerBase, ProcessorMixin)):
+        tokenizer = patch_saving_functions(tokenizer)
 
     try:
         base_model_name = get_model_name(self.config._name_or_path, load_in_4bit = False)
@@ -2865,6 +2978,9 @@ def unsloth_generic_save(
     maximum_memory_usage: float = 0.9,
     datasets: Optional[List[str]] = None,
 ):
+    if isinstance(tokenizer, (PreTrainedTokenizerBase, ProcessorMixin)):
+        tokenizer = patch_saving_functions(tokenizer)
+
     if token is None and push_to_hub:
         token = get_token()
@@ -2916,8 +3032,13 @@
             **_save_kwargs,
         )
         if tokenizer is not None:
-            old_padding_side = tokenizer.padding_side
-            tokenizer.padding_side = "left"
+            _tokenizer = (
+                tokenizer.tokenizer
+                if hasattr(tokenizer, "tokenizer")
+                else tokenizer
+            )
+            old_padding_side = _tokenizer.padding_side
+            _tokenizer.padding_side = "left"
             tokenizer.push_to_hub(
                 save_directory,
                 token = token,
@@ -2926,15 +3047,20 @@
                 create_pr = create_pr,
                 revision = revision,
             )
-            tokenizer.padding_side = old_padding_side
+            _tokenizer.padding_side = old_padding_side
     else:
         print(f"Unsloth: Saving full fine-tuned model to '{save_directory}' ...")
         model.save_pretrained(save_directory, **_save_kwargs)
         if tokenizer is not None:
-            old_padding_side = tokenizer.padding_side
-            tokenizer.padding_side = "left"
+            _tokenizer = (
+                tokenizer.tokenizer
+                if hasattr(tokenizer, "tokenizer")
+                else tokenizer
+            )
+            old_padding_side = _tokenizer.padding_side
+            _tokenizer.padding_side = "left"
             tokenizer.save_pretrained(save_directory)
-            tokenizer.padding_side = old_padding_side
+            _tokenizer.padding_side = old_padding_side
 
         print(f"Unsloth: Model saved successfully to '{save_directory}'")
     else:
@@ -3154,6 +3280,8 @@ def _unsloth_save_torchao_with_given_config(
     auto_processor = AutoProcessor if is_vlm else AutoTokenizer
 
     tokenizer = auto_processor.from_pretrained(save_directory)
+    if isinstance(tokenizer, (PreTrainedTokenizerBase, ProcessorMixin)):
+        tokenizer = patch_saving_functions(tokenizer)
 
     # TorchAO must only use bfloat16 for loading (float16 fails)
     if HAS_TORCH_DTYPE:
@@ -3184,7 +3312,7 @@
     quantized_model.save_pretrained(
         torchao_save_directory, safe_serialization = safe_serialization
     )
-    tokenizer.save_pretrained(torchao_save_directory)
+    tokenizer.save_pretrained(torchao_save_directory, token = token)
 
     # Clean up the intermediate unquantized model
     if os.path.exists(save_directory):
@@ -3223,6 +3351,9 @@ def unsloth_save_pretrained_torchao(
     `push_to_hub` (bool): whether to push to huggingface hub or save locally
     `token`: HuggingFace token for pushing to hub
     """
+    if isinstance(tokenizer, (PreTrainedTokenizerBase, ProcessorMixin)):
+        tokenizer = patch_saving_functions(tokenizer)
+
     if token is None and push_to_hub:
         token = get_token()
@@ -3342,6 +3473,43 @@ def patch_saving_functions(model, vision = False):
     '''
     exec(push_to_hub_text, globals())
 
+    def unsloth_tokenizer_save_pretrained(
+        self,
+        save_directory,
+        legacy_format = None,
+        filename_prefix = None,
+        push_to_hub = False,
+        **kwargs,
+    ):
+        result = self.original_save_pretrained(
+            save_directory,
+            legacy_format = legacy_format,
+            filename_prefix = filename_prefix,
+            push_to_hub = False,
+            **kwargs,
+        )
+        _preserve_sentencepiece_tokenizer_assets(
+            self,
+            save_directory,
+            token = kwargs.get("token", None),
+        )
+        if push_to_hub:
+            push_kwargs = dict(kwargs)
+            repo_id = push_kwargs.pop("repo_id", save_directory)
+            self.push_to_hub(repo_id, **push_kwargs)
+        return result
+
+    if (
+        isinstance(model, PreTrainedTokenizerBase)
+        and model.save_pretrained.__name__ != "unsloth_tokenizer_save_pretrained"
+    ):
+        model.original_save_pretrained = model.save_pretrained
+        model.save_pretrained = types.MethodType(
+            unsloth_tokenizer_save_pretrained, model
+        )
+    elif getattr(model, "tokenizer", None) is not None:
+        patch_saving_functions(model.tokenizer)
+
     original_model = model
     while True:
         # Check if push_to_hub exists before accessing its __name__
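Hedged end-to-end usage of the patched save path (the model id and repo name are illustrative):

    from unsloth import FastLanguageModel

    # Any SentencePiece-based checkpoint (e.g. Gemma) exercises the fix.
    model, tokenizer = FastLanguageModel.from_pretrained("unsloth/gemma-2-9b-bnb-4bit")

    # The wrapped save writes locally, restores tokenizer.model and
    # added_tokens_decoder, then pushes once. Per the Unsloth convention kept
    # by the revert above, save_directory doubles as the Hub repo id.
    tokenizer.save_pretrained("your-user/your-repo", push_to_hub = True)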