mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-17 04:09:19 +00:00
* move conversion code to a dedicated conversion directory and split the files akin to the src/models architecture --------- Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
58 lines
2.6 KiB
Python
58 lines
2.6 KiB
Python
from __future__ import annotations
|
|
|
|
from typing import Iterable, TYPE_CHECKING
|
|
|
|
import torch
|
|
|
|
if TYPE_CHECKING:
|
|
from torch import Tensor
|
|
|
|
from .base import ModelBase, TextModel, gguf
|
|
|
|
|
|
@ModelBase.register("FalconForCausalLM", "RWForCausalLM")
class FalconModel(TextModel):
    """Conversion handler for Falcon (and its earlier "RW" naming) causal LMs.

    Registered for both the modern ``FalconForCausalLM`` and the legacy
    ``RWForCausalLM`` architecture strings, so it must tolerate both the new
    and old hyperparameter names in ``config.json``.
    """

    model_arch = gguf.MODEL_ARCH.FALCON

    def set_gguf_parameters(self) -> None:
        """Write Falcon hyperparameters into the GGUF metadata.

        Falls back to the legacy RW-era hparam names (``n_head``,
        ``n_head_kv``) when the modern ones are absent.
        """
        n_head = self.hparams.get("num_attention_heads")
        if n_head is None:
            n_head = self.hparams["n_head"]  # old name

        n_head_kv = self.hparams.get("num_kv_heads")
        if n_head_kv is None:
            n_head_kv = self.hparams.get("n_head_kv", 1)  # old name

        # Falcon configs do not carry a context length, so it is hard-coded.
        self.gguf_writer.add_context_length(2048)  # not in config.json
        # Marks that the QKV tensors were rearranged by the transform below
        # (layout named after its author, jploski).
        self.gguf_writer.add_tensor_data_layout("jploski")  # qkv tensor transform
        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
        # Falcon's MLP width is fixed at 4x the hidden size rather than
        # being stored in the config.
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"])
        self.gguf_writer.add_block_count(self.block_count)
        self.gguf_writer.add_head_count(n_head)
        self.gguf_writer.add_head_count_kv(n_head_kv)
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_file_type(self.ftype)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Rearrange fused query_key_value weights before handing off to the base class.

        # QKV tensor transform
        # The original query_key_value tensor contains n_head_kv "kv groups",
        # each consisting of n_head/n_head_kv query weights followed by one key
        # and one value weight (shared by all query heads in the kv group).
        # This layout makes it a big pain to work with in GGML.
        # So we rearrange them here, so that we have n_head query weights
        # followed by n_head_kv key weights followed by n_head_kv value weights,
        # in contiguous fashion.
        # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
        """
        if "query_key_value" in name:
            n_head = self.find_hparam(["num_attention_heads", "n_head"])
            n_head_kv = self.find_hparam(["num_kv_heads", "n_head_kv"], optional=True) or 1
            head_dim = self.hparams["hidden_size"] // n_head

            # View as (kv groups, queries-per-group + key + value, head, row):
            # the last two rows of each group are the shared K and V heads.
            qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
            q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head)
            # [-2] / [-1] (list indexing) keep the sliced dimension so the
            # reshape below stays rank-consistent with the Q slice.
            k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
            v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
            # Reassemble as contiguous [all Q | all K | all V] in the
            # original tensor's shape.
            data_torch = torch.cat((q, k, v)).reshape_as(data_torch)

        yield from super().modify_tensors(data_torch, name, bid)