Mirror of https://github.com/LostRuins/koboldcpp.git, synced 2025-09-09 16:44:35 +00:00
convert : support interns1-mini (#15412)
* support interns1-mini
* fix comment
* update
This commit is contained in:
parent
c247d06f38
commit
7da9fed0d6
1 changed file with 65 additions and 68 deletions
@@ -1216,6 +1216,55 @@ class TextModel(ModelBase):
                 raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported")
             self.gguf_writer.add_pooling_type(pooling_type)
 
+    def _set_vocab_interns1(self):
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
+        vocab_size = self.hparams.get("vocab_size", len(vocab))
+        assert max(vocab.values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
+        added_vocab = tokenizer.get_added_vocab()
+
+        added_tokens_decoder = tokenizer.added_tokens_decoder
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            else:
+                token: str = reverse_vocab[i]
+                if token in added_vocab:
+                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
+                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
+                    if not added_tokens_decoder[i].normalized:
+                        previous_token = token
+                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                        if previous_token != token:
+                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
+
+                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+                tokens.append(token)
+
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab._set_special_token("bos", 151643)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
 
 class MmprojModel(ModelBase):
     model_type = ModelType.MMPROJ
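A note on the normalization step above: llama.cpp expects CONTROL and USER_DEFINED tokens to arrive pre-normalized, so the converter round-trips any added token whose AddedToken entry is marked non-normalized through encode/decode. A minimal standalone sketch of that round trip, assuming transformers is installed; the model path is a placeholder, not taken from the commit:

# Sketch of the added-token normalization round trip.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./interns1-mini", trust_remote_code=True)  # placeholder path
for token_id, added in tokenizer.added_tokens_decoder.items():
    if not added.normalized:
        # Encode then decode so the stored string matches what the tokenizer
        # would actually produce for this token.
        round_tripped = tokenizer.decode(tokenizer.encode(added.content, add_special_tokens=False))
        if round_tripped != added.content:
            print(f"{added.content!r} round-trips to {round_tripped!r}")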
@@ -2932,7 +2981,8 @@ class Qwen2Model(TextModel):
         if "language_model." in name:
             name = name.replace("language_model.", "") # for InternVL
         if name.startswith("mlp") or name.startswith("multi_modal_projector") \
-                or name.startswith("vision_model") or name.startswith("audio_tower"):
+                or name.startswith("vision_model") or name.startswith("audio_tower") \
+                or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"):
             # skip vision and audio tensors
             return []
         yield from super().modify_tensors(data_torch, name, bid)
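The two new prefixes presumably cover checkpoints that nest the vision tower and projector under model., where the old checks missed them. A toy check of the extended filter; the tensor names below are illustrative, not from the commit:

# Illustrative tensor names run against the extended skip list; the real
# filter lives in Qwen2Model.modify_tensors.
skip_prefixes = (
    "mlp", "multi_modal_projector", "vision_model", "audio_tower",
    "model.vision_tower", "model.multi_modal_projector",  # added by this commit
)

for name in (
    "model.vision_tower.blocks.0.attn.qkv.weight",  # skipped (new prefix)
    "model.multi_modal_projector.linear_1.weight",  # skipped (new prefix)
    "model.layers.0.self_attn.q_proj.weight",       # still converted
):
    print(f"{name} -> {'skip' if name.startswith(skip_prefixes) else 'keep'}")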
@@ -3604,6 +3654,19 @@ class Qwen2MoeModel(TextModel):
 class Qwen3Model(Qwen2Model):
     model_arch = gguf.MODEL_ARCH.QWEN3
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
+        self.origin_hf_arch = hparams.get('architectures', [None])[0]
+
+    def set_vocab(self):
+        # deal with intern-s1-mini
+        if self.origin_hf_arch == 'InternS1ForConditionalGeneration':
+            self._set_vocab_interns1()
+            return
+
+        super().set_vocab()
+
 
 @ModelBase.register("Qwen3MoeForCausalLM")
 class Qwen3MoeModel(Qwen2MoeModel):
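origin_hf_arch records the first entry of the architectures list in the checkpoint's config.json (as read by ModelBase.load_hparams), which lets the converter tell an intern-s1-mini export apart from a plain Qwen3 one. A standalone sketch of the same lookup, with a hypothetical model directory:

# Reads the architectures field the way the new __init__ does; the model
# directory is a placeholder.
import json
from pathlib import Path

config = json.loads((Path("./interns1-mini") / "config.json").read_text(encoding="utf-8"))
origin_hf_arch = config.get("architectures", [None])[0]
print(origin_hf_arch)  # 'InternS1ForConditionalGeneration' triggers _set_vocab_interns1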
@@ -3620,73 +3683,7 @@ class Qwen3MoeModel(Qwen2MoeModel):
             self._set_vocab_interns1()
             return
 
-        try:
-            self._set_vocab_sentencepiece()
-        except FileNotFoundError:
-            self._set_vocab_gpt2()
-
-    def _set_vocab_interns1(self):
-        tokens: list[str] = []
-        toktypes: list[int] = []
-
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
-        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
-        vocab_size = self.hparams.get("vocab_size", len(vocab))
-        assert max(vocab.values()) < vocab_size
-
-        tokpre = self.get_vocab_base_pre(tokenizer)
-
-        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
-        added_vocab = tokenizer.get_added_vocab()
-
-        added_tokens_decoder = tokenizer.added_tokens_decoder
-
-        for i in range(vocab_size):
-            if i not in reverse_vocab:
-                tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.UNUSED)
-            else:
-                token: str = reverse_vocab[i]
-                if token in added_vocab:
-                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
-                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
-                    if not added_tokens_decoder[i].normalized:
-                        previous_token = token
-                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
-                        if previous_token != token:
-                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
-
-                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
-                        toktypes.append(gguf.TokenType.CONTROL)
-                    else:
-                        toktypes.append(gguf.TokenType.USER_DEFINED)
-                else:
-                    toktypes.append(gguf.TokenType.NORMAL)
-                tokens.append(token)
-
-        self.gguf_writer.add_tokenizer_model("gpt2")
-        self.gguf_writer.add_tokenizer_pre(tokpre)
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
-        special_tokens_map_file = self.dir_model / 'special_tokens_map.json'
-        additional_special_tokens = []
-        if special_tokens_map_file.is_file():
-            with open(special_tokens_map_file, encoding = 'utf-8') as f:
-                additional_special_tokens = json.load(f).get('additional_special_tokens', [])
-        tokenizer_cfg_file = self.dir_model / 'special_tokens_map.json'
-        if tokenizer_cfg_file.is_file():
-            with open(tokenizer_cfg_file, encoding = 'utf-8') as f:
-                added_tokens_decoder = json.load(f).get('added_tokens_decoder', {})
-                token2ids_map = {data['content'] : int(token) for token, data in added_tokens_decoder.items() if data['special']}
-                for token in additional_special_tokens:
-                    if token in token2ids_map:
-                        special_vocab._set_special_token(token, token2ids_map[token])
-                special_vocab._set_special_token('eos', 151645)
-                special_vocab._set_special_token("bos", 151643)
-        special_vocab.add_to_gguf(self.gguf_writer)
+        super().set_vocab()
 
 
 @ModelBase.register("GPT2LMHeadModel")
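With _set_vocab_interns1 now defined once on TextModel (first hunk), the duplicate here can go: the call in Qwen3MoeModel.set_vocab resolves to the base-class copy through the MRO. A toy sketch of that resolution, with stand-in classes rather than the real converter:

# Stand-in classes showing why the subclass override is redundant once the
# method lives on the base class.
class TextModel:
    def _set_vocab_interns1(self):
        print("base-class interns1 vocab path")

    def set_vocab(self):
        print("default vocab path")

class Qwen3MoeModel(TextModel):
    def set_vocab(self):
        is_interns1 = True  # stand-in for the real architecture check
        if is_interns1:
            self._set_vocab_interns1()  # resolves to TextModel's copy
            return
        super().set_vocab()

Qwen3MoeModel().set_vocab()  # prints: base-class interns1 vocab path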