Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-10 17:14:36 +00:00)
Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	CMakeLists.txt
#	Makefile
#	ggml-metal.m
Commit 93d3871056

22 changed files with 341 additions and 198 deletions
.github/workflows/server.yml (vendored, 17 changes)
@@ -25,17 +25,14 @@ jobs:
     strategy:
       matrix:
         sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        build_type: [Debug, Release]
+        build_type: [Debug]
         include:
           - build_type: Release
             sanitizer: ""
-        exclude:
-          - build_type: Release
-            sanitizer: ADDRESS
-          - build_type: Release
-            sanitizer: THREAD
-          - build_type: Release
-            sanitizer: UNDEFINED
+          - build_type: Debug
+            sanitizer: THREAD
+            disabled_on_pr: true
+      fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
 
     container:
       image: ubuntu:latest
@@ -81,13 +78,14 @@ jobs:
 
       - name: Tests
        id: server_integration_tests
+        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
         run: |
           cd examples/server/tests
           PORT=8888 ./tests.sh
 
       - name: Slow tests
         id: server_integration_tests_slow
-        if: ${{ github.event.schedule != '' && matrix.build_type == 'Release' || github.event.inputs.slow_tests == 'true' }}
+        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
         run: |
           cd examples/server/tests
           PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow
@@ -124,13 +122,14 @@ jobs:
 
       - name: Tests
         id: server_integration_tests
+        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
         run: |
           cd examples/server/tests
           behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
 
       - name: Slow tests
         id: server_integration_tests_slow
-        if: ${{ github.event.schedule != '' || github.event.inputs.slow_tests == 'true' }}
+        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
         run: |
           cd examples/server/tests
           behave.exe --stop --no-skipped --no-capture --tags slow
.gitignore (vendored, 2 changes)
@@ -13,6 +13,8 @@
 .vs/
 .vscode/
 
+ggml-metal-embed.metal
+
 lcov-report/
 gcovr-report/
 
@@ -1878,3 +1878,16 @@ void llama_embd_normalize(const float * inp, float * out, int n) {
     }
 }
 
+float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n){
+    double sum  = 0.0;
+    double sum1 = 0.0;
+    double sum2 = 0.0;
+
+    for (int i = 0; i < n; i++) {
+        sum  += embd1[i] * embd2[i];
+        sum1 += embd1[i] * embd1[i];
+        sum2 += embd2[i] * embd2[i];
+    }
+
+    return sum / (sqrt(sum1) * sqrt(sum2));
+}
@@ -282,3 +282,4 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40
 
 void llama_embd_normalize(const float * inp, float * out, int n);
 
+float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
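The helper declared above operates on raw float buffers, so it can be called directly on the flat embedding arrays that the examples below use. A minimal usage sketch follows; the toy vectors and the assumption that the program is built inside the llama.cpp tree and linked against the common library are illustrative, not part of this commit.

```cpp
// Minimal sketch: compare two embedding vectors with the new helpers from common.h.
// Assumes compilation within the llama.cpp tree, linked against the common library.
#include "common.h"

#include <cstdio>
#include <vector>

int main() {
    // two toy 4-dimensional "embeddings" (hypothetical values)
    std::vector<float> a = { 0.1f, 0.2f, 0.3f, 0.4f };
    std::vector<float> b = { 0.4f, 0.3f, 0.2f, 0.1f };

    const int n_embd = (int) a.size();

    // optional: L2-normalize into separate buffers
    std::vector<float> a_norm(n_embd), b_norm(n_embd);
    llama_embd_normalize(a.data(), a_norm.data(), n_embd);
    llama_embd_normalize(b.data(), b_norm.data(), n_embd);

    // cosine similarity is scale-invariant, so both calls print the same value
    printf("cos(a, b)           = %f\n", llama_embd_similarity_cos(a.data(),      b.data(),      n_embd));
    printf("cos(a_norm, b_norm) = %f\n", llama_embd_similarity_cos(a_norm.data(), b_norm.data(), n_embd));

    return 0;
}
```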
convert.py (126 changes)
@@ -332,6 +332,9 @@ class Params:
 #
 
 class BpeVocab:
+    tokenizer_model = "gpt2"
+    name = "bpe"
+
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
         self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
         if isinstance(self.bpe_tokenizer.get('model'), dict):
@@ -390,6 +393,9 @@ class BpeVocab:
 
 
 class SentencePieceVocab:
+    tokenizer_model = "llama"
+    name = "spm"
+
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
         self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
         added_tokens: dict[str, int]
@@ -453,6 +459,9 @@ class SentencePieceVocab:
 
 
 class HfVocab:
+    tokenizer_model = "llama"
+    name = "hfft"
+
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
         try:
             from transformers import AutoTokenizer
@@ -553,7 +562,15 @@ class HfVocab:
         return f"<HfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 
 
-Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab"
+class NoVocab:
+    tokenizer_model = "no_vocab"
+    name = "no_vocab"
+
+    def __repr__(self) -> str:
+        return "<NoVocab for a model without integrated vocabulary>"
+
+
+Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab | NoVocab"
 
 
 #
@@ -935,8 +952,10 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
     # Handle special case where the model's vocab size is not set
     if params.n_vocab == -1:
         raise ValueError(
-            f"The model's vocab size is set to -1 in params.json. Please update it manually. Maybe {vocab.vocab_size}?"
+            f"The model's vocab size is set to -1 in params.json. Please update it manually.{f' Maybe {vocab.vocab_size}?' if hasattr(vocab, 'vocab_size') else ''}"
         )
+    if isinstance(vocab, NoVocab):
+        return  # model has no vocab
 
     # Check for a vocab size mismatch
     if params.n_vocab == vocab.vocab_size:
@@ -977,6 +996,7 @@ class OutputFile:
         name = str(params.path_model.parent).split('/')[-1]
 
         self.gguf.add_name                (name)
+        self.gguf.add_vocab_size          (params.n_vocab)
         self.gguf.add_context_length      (params.n_ctx)
         self.gguf.add_embedding_length    (params.n_embd)
         self.gguf.add_block_count         (params.n_layer)
@@ -1013,21 +1033,9 @@ class OutputFile:
         if params.ftype is not None:
             self.gguf.add_file_type(params.ftype)
 
-    def handle_tokenizer_model(self, vocab: Vocab) -> str:
-        # Map the vocab types to the supported tokenizer models
-        tokenizer_model = {
-            SentencePieceVocab: "llama",
-            HfVocab: "llama",
-            BpeVocab: "gpt2",
-        }.get(type(vocab))
-
-        # Block if vocab type is not predefined
-        if tokenizer_model is None:
-            raise ValueError("Unknown vocab type: Not supported")
-
-        return tokenizer_model
-
     def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
+        assert not isinstance(vocab, NoVocab)
+
         tokens = []
         scores = []
         toktypes = []
@@ -1043,11 +1051,8 @@ class OutputFile:
         return tokens, scores, toktypes
 
     def add_meta_vocab(self, vocab: Vocab) -> None:
-        # Handle the tokenizer model
-        tokenizer_model = self.handle_tokenizer_model(vocab)
-
         # Ensure that tokenizer_model is added to the GGUF model
-        self.gguf.add_tokenizer_model(tokenizer_model)
+        self.gguf.add_tokenizer_model(vocab.tokenizer_model)
 
         # Extract model vocabulary for model conversion
         tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab)
@@ -1074,6 +1079,26 @@ class OutputFile:
     def write_tensor_info(self) -> None:
         self.gguf.write_ti_data_to_file()
 
+    def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency: int) -> None:
+        ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency=concurrency)
+        if ftype == GGMLFileType.MostlyQ8_0:
+            ndarrays = bounded_parallel_map(
+                OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
+                use_processpool_executor=True,
+            )
+        else:
+            ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
+
+        start = time.time()
+        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
+            elapsed = time.time() - start
+            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
+            padi = len(str(len(model)))
+            print(
+                f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
+            )
+            self.gguf.write_tensor_data(ndarray)
+
     def close(self) -> None:
         self.gguf.close()
 
@@ -1082,7 +1107,7 @@ class OutputFile:
         fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
         endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
     ) -> None:
-        check_vocab_size(params, vocab, pad_vocab = pad_vocab)
+        check_vocab_size(params, vocab, pad_vocab=pad_vocab)
 
         of = OutputFile(fname_out, endianess=endianess)
 
@@ -1120,8 +1145,11 @@ class OutputFile:
 
         # meta data
         of.add_meta_arch(params)
-        of.add_meta_vocab(vocab)
-        of.add_meta_special_vocab(svocab)
+        if isinstance(vocab, NoVocab):
+            of.gguf.add_tokenizer_model(vocab.tokenizer_model)
+        else:
+            of.add_meta_vocab(vocab)
+            of.add_meta_special_vocab(svocab)
 
         # tensor info
         for name, lazy_tensor in model.items():
@@ -1131,24 +1159,7 @@ class OutputFile:
         of.write_tensor_info()
 
         # tensor data
-        ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
-        if ftype == GGMLFileType.MostlyQ8_0:
-            ndarrays = bounded_parallel_map(
-                OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
-                use_processpool_executor=True,
-            )
-        else:
-            ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
-
-        start = time.time()
-        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
-            elapsed = time.time() - start
-            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
-            padi = len(str(len(model)))
-            print(
-                f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
-            )
-            of.gguf.write_tensor_data(ndarray)
+        of.write_tensor_data(ftype, model, concurrency)
 
         of.close()
 
@@ -1309,8 +1320,8 @@ class VocabFactory:
             return vtype, path
         raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}")
 
-    def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
-        load_merges = vocabtype == "bpe"
+    def _create_special_vocab(self, vocab: Vocab, model_parent_path: Path) -> gguf.SpecialVocab:
+        load_merges = vocab.name == "bpe"
         n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
         return gguf.SpecialVocab(
             model_parent_path,
@@ -1319,30 +1330,34 @@ class VocabFactory:
             n_vocab=n_vocab,
         )
 
-    def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
+    def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab:
         vocab_type, path = self._select_file(vocab_types)
         print(f"Loading vocab file {path!r}, type {vocab_type!r}")
 
         added_tokens_path = path.parent / "added_tokens.json"
-        vocab: Vocab
         if vocab_type == "bpe":
-            vocab = BpeVocab(
+            return BpeVocab(
                 path, added_tokens_path if added_tokens_path.exists() else None
             )
-        elif vocab_type == "spm":
-            vocab = SentencePieceVocab(
+        if vocab_type == "spm":
+            return SentencePieceVocab(
                 path, added_tokens_path if added_tokens_path.exists() else None
            )
-        elif vocab_type == "hfft":
-            vocab = HfVocab(
+        if vocab_type == "hfft":
+            return HfVocab(
                 path.parent, added_tokens_path if added_tokens_path.exists() else None
             )
+        raise ValueError(vocab_type)
+
+    def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
+        vocab: Vocab
+        if len(vocab_types) == 1 and "no_vocab" in vocab_types:
+            vocab = NoVocab()
         else:
-            raise ValueError(vocab_type)
+            vocab = self._create_vocab_by_path(vocab_types)
         # FIXME: Respect --vocab-dir?
         special_vocab = self._create_special_vocab(
             vocab,
-            vocab_type,
             model_parent_path,
         )
         return vocab, special_vocab
@@ -1380,6 +1395,7 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
     parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
     parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
+    parser.add_argument("--no-vocab", action="store_true", help="store model without the vocab")
     parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
     parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
     parser.add_argument("--vocab-type", help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft")
@@ -1392,6 +1408,10 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
 
     args = parser.parse_args(args_in)
+    if args.no_vocab:
+        if args.vocab_only:
+            raise ValueError("no need to specify --vocab-only if using --no-vocab")
+        args.vocab_type = "no_vocab"
 
     if args.dump_single:
         model_plus = lazy_load_file(args.model)
@@ -1442,7 +1462,7 @@ def main(args_in: list[str] | None = None) -> None:
         print(f"Wrote {outfile}")
         return
 
-    if model_plus.vocab is not None and args.vocab_dir is None:
+    if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
         vocab = model_plus.vocab
 
     print(f"Vocab info: {vocab}")
@@ -113,13 +113,20 @@ int main(int argc, char ** argv) {
     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
     for (const auto & prompt : prompts) {
-        auto inp = ::llama_tokenize(ctx, prompt, true);
+        auto inp = ::llama_tokenize(ctx, prompt, true, false);
         if (inp.size() > n_batch) {
             inp.resize(n_batch);
         }
         inputs.push_back(inp);
     }
 
+    // add eos if not present
+    for (auto & inp : inputs) {
+        if (inp.empty() || inp.back() != llama_token_eos(model)) {
+            inp.push_back(llama_token_eos(model));
+        }
+    }
+
     // tokenization stats
     if (params.verbose_prompt) {
         for (int i = 0; i < (int) inputs.size(); i++) {
@@ -168,15 +175,26 @@ int main(int argc, char ** argv) {
         float * out = emb + p * n_embd;
         batch_decode(ctx, batch, out, s, n_embd);
 
-    // print first 3 embeddings
-    for (int j = 0; j < std::min(3, n_prompts); j++) {
-        fprintf(stderr, "embedding %d: ", j);
-        for (int i = 0; i < n_embd; i++) {
-            fprintf(stderr, "%f ", emb[j * n_embd + i]);
+    // print the first part of the embeddings
+    fprintf(stdout, "\n");
+    for (int j = 0; j < n_prompts; j++) {
+        fprintf(stdout, "embedding %d: ", j);
+        for (int i = 0; i < std::min(16, n_embd); i++) {
+            fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+        }
+        fprintf(stdout, "\n");
+    }
+
+    // print cosine similarity matrix
+    fprintf(stdout, "\n");
+    printf("cosine similarity matrix:\n\n");
+    for (int i = 0; i < n_prompts; i++) {
+        for (int j = 0; j < n_prompts; j++) {
+            float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+            fprintf(stdout, "%6.2f ", sim);
         }
-        fprintf(stderr, "\n\n");
+        fprintf(stdout, "\n");
     }
-    fprintf(stderr, "\n");
 
     // clean up
     llama_print_timings(ctx);
@@ -211,6 +211,7 @@ static bool gguf_ex_read_1(const std::string & fname) {
         for (int j = 0; j < ggml_nelements(cur); ++j) {
             if (data[j] != 100 + i) {
                 fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]);
+                gguf_free(ctx);
                 return false;
             }
         }
@@ -6,22 +6,6 @@
 
 // #define GRIT_DEBUG
 
-static float dot_product(const std::vector<float> & v1, const std::vector<float> & v2) {
-    float dot = 0.0f;
-    for (uint64_t i = 0; i < v1.size(); ++i) {
-        dot += v1[i] * v2[i];
-    }
-    return dot;
-}
-
-static float norm(const std::vector<float> & v) {
-    return std::sqrt(dot_product(v, v));
-}
-
-static float cosine_similarity(const std::vector<float> & v1, const std::vector<float> & v2) {
-    return dot_product(v1, v2) / (norm(v1) * norm(v2));
-}
-
 static std::vector<std::vector<float>> encode(llama_context * ctx, const std::vector<std::string> & sentences, const std::string & instruction) {
     std::vector<std::vector<float>> result;
 
@@ -203,10 +187,12 @@ int main(int argc, char * argv[]) {
     const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
     const std::vector<std::vector<float>> q_rep = encode(ctx, queries, gritlm_instruction(instruction));
 
-    const float cosine_sim_q0_d0 = cosine_similarity(q_rep[0], d_rep[0]);
-    const float cosine_sim_q0_d1 = cosine_similarity(q_rep[0], d_rep[1]);
-    const float cosine_sim_q1_d0 = cosine_similarity(q_rep[1], d_rep[0]);
-    const float cosine_sim_q1_d1 = cosine_similarity(q_rep[1], d_rep[1]);
+    const int n_embd = llama_n_embd(mdl);
+
+    const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
+    const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
+    const float cosine_sim_q1_d0 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd);
+    const float cosine_sim_q1_d1 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd);
 
     std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0);
     std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1);
@@ -104,6 +104,7 @@ static std::string get_cpu_info() {
                 }
             }
         }
+        fclose(f);
     }
 #endif
     // TODO: other platforms
@@ -63,12 +63,20 @@ Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` director
 ```console
 git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
 ```
-2) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
+
+2) Install the required Python packages:
+
+```sh
+pip install -r examples/llava/requirements.txt
+```
+
+3) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
 ```console
 python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/
 ```
 - you will find a llava.projector and a llava.clip file in your model directory
-3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
+
+4) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
 ```console
 mkdir vit
 cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin
@@ -76,18 +84,18 @@ cp ../llava-v1.6-vicuna-7b/llava.projector vit/
 curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json
 ```
 
-4) Create the visual gguf model:
+5) Create the visual gguf model:
 ```console
 python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
 ```
 - This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
 
-5) Then convert the model to gguf format:
+6) Then convert the model to gguf format:
 ```console
 python ./convert.py ../llava-v1.6-vicuna-7b/ --skip-unknown
 ```
 
-6) And finally we can run the llava-cli using the 1.6 model version:
+7) And finally we can run the llava-cli using the 1.6 model version:
 ```console
 ./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
 ```
@@ -995,6 +995,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     if (!new_clip->ctx_data) {
         fprintf(stderr, "%s: ggml_init() failed\n", __func__);
         clip_free(new_clip);
+        gguf_free(ctx);
         return nullptr;
     }
 
@@ -1002,6 +1003,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     if (!fin) {
         printf("cannot open model file for loading tensors\n");
         clip_free(new_clip);
+        gguf_free(ctx);
         return nullptr;
     }
 
@@ -1023,6 +1025,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         if (!fin) {
             printf("%s: failed to seek for tensor %s\n", __func__, name);
             clip_free(new_clip);
+            gguf_free(ctx);
             return nullptr;
         }
         int num_bytes = ggml_nbytes(cur);
@@ -1908,6 +1911,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
                 break;
             default:
                 printf("Please use an input file in f32 or f16\n");
+                gguf_free(ctx_out);
                 return false;
         }
 
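The clip.cpp hunks above, like the gguf example and the training checkpoint change further down in this commit, apply the same fix: every early-return error path that follows gguf_init_from_file() now releases the GGUF context before returning. A small sketch of that pattern in isolation; the function name and the specific checks are illustrative, only the gguf_* calls come from the ggml API.

```cpp
// Sketch of the leak-fix pattern applied above: free the gguf context on
// every early-return path, not just on the success path.
#include "ggml.h"

#include <cstdio>

static bool inspect_gguf(const char * fname) {
    struct gguf_init_params params = {
        /*.no_alloc =*/ true,
        /*.ctx      =*/ NULL,
    };

    struct gguf_context * ctx = gguf_init_from_file(fname, params);
    if (!ctx) {
        fprintf(stderr, "failed to open %s\n", fname);
        return false;
    }

    if (gguf_get_n_tensors(ctx) == 0) {
        // early-return path: without this call the context would leak
        gguf_free(ctx);
        return false;
    }

    printf("%s: %d tensors\n", fname, (int) gguf_get_n_tensors(ctx));

    gguf_free(ctx);
    return true;
}
```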
@@ -119,6 +119,10 @@ def step_server_metrics(context):
 def step_start_server(context):
     start_server_background(context)
     attempts = 0
+    max_attempts = 20
+    if 'GITHUB_ACTIONS' in os.environ:
+        max_attempts *= 2
+
     while True:
         with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
             result = sock.connect_ex((context.server_fqdn, context.server_port))
@@ -126,7 +130,7 @@ def step_start_server(context):
                 print("\x1b[33;46mserver started!\x1b[0m")
                 return
             attempts += 1
-            if attempts > 20:
+            if attempts > max_attempts:
                 assert False, "server not started"
             print(f"waiting for server to start, connect error code = {result}...")
             time.sleep(0.1)
@@ -943,6 +947,9 @@ async def wait_for_health_status(context,
    print(f"Starting checking for health for expected_health_status={expected_health_status}\n")
    interval = 0.5
    counter = 0
+    if 'GITHUB_ACTIONS' in os.environ:
+        timeout *= 2
+
    async with aiohttp.ClientSession() as session:
        while True:
            async with await session.get(f'{base_url}/health', params=params) as health_response:
@@ -711,6 +711,7 @@ static bool load_checkpoint_file(const char * filename, struct my_llama_model *
 
     load_checkpoint_gguf(fctx, f_ggml_ctx, model, train);
 
+    gguf_free(fctx);
     return true;
 }
 
ggml-metal.m (49 changes)
@@ -280,6 +280,11 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
     id<MTLLibrary> metal_library;
 
     // load library
+    //
+    // - first check if the library is embedded
+    // - then check if the library is in the bundle
+    // - if not found, load the source and compile it
+    // - if that fails, return NULL
     {
         NSBundle * bundle = nil;
 #ifdef SWIFT_PACKAGE
@@ -287,12 +292,21 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
 #else
         bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
 #endif
 
         NSError * error = nil;
-        NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
-        if (libPath != nil) {
+
+#if GGML_METAL_EMBED_LIBRARY
+        const bool try_metallib = false;
+#else
+        const bool try_metallib = true;
+#endif
+
+        NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
+        if (try_metallib && path_lib != nil) {
             // pre-compiled library found
-            NSURL * libURL = [NSURL fileURLWithPath:libPath];
-            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
+            NSURL * libURL = [NSURL fileURLWithPath:path_lib];
+            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
 
             metal_library = [ctx->device newLibraryWithURL:libURL error:&error];
             if (error) {
                 GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
@@ -305,31 +319,34 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
             extern const char ggml_metallib_start[];
             extern const char ggml_metallib_end[];
 
             NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
 #else
             GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
 
-            NSString * sourcePath;
-            NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
+            NSString * path_source;
+            NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
 
-            GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, ggmlMetalPathResources ? [ggmlMetalPathResources UTF8String] : "nil");
+            GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
 
-            if (ggmlMetalPathResources) {
-                sourcePath = [ggmlMetalPathResources stringByAppendingPathComponent:@"ggml-metal-merged.metal"];
+            if (path_resource) {
+                path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal-merged.metal"];
             } else {
-                sourcePath = [bundle pathForResource:@"ggml-metal-merged" ofType:@"metal"];
+                path_source = [bundle pathForResource:@"ggml-metal-merged" ofType:@"metal"];
             }
-            if (sourcePath == nil) {
+
+            if (path_source == nil) {
                 GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal-merged.metal, falling back to trying cwd\n", __func__);
-                sourcePath = @"ggml-metal-merged.metal";
+                path_source = @"ggml-metal.metal";
             }
-            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]);
-            NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error];
+
+            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
+
+            NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
             if (error) {
                 GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
                 return NULL;
             }
-#endif
+#endif // GGML_METAL_EMBED_LIBRARY
 
         @autoreleasepool {
             // dictionary of preprocessor macros
ggml.h (64 changes)
@@ -344,24 +344,24 @@ extern "C" {
     struct ggml_object;
     struct ggml_context;
 
+    // NOTE: always add types at the end of the enum to keep backward compatibility
     enum ggml_type {
         GGML_TYPE_F32  = 0,
         GGML_TYPE_F16  = 1,
         GGML_TYPE_Q4_0 = 2,
         GGML_TYPE_Q4_1 = 3,
         // GGML_TYPE_Q4_2 = 4, support has been removed
-        // GGML_TYPE_Q4_3 (5) support has been removed
+        // GGML_TYPE_Q4_3 = 5, support has been removed
         GGML_TYPE_Q5_0 = 6,
         GGML_TYPE_Q5_1 = 7,
         GGML_TYPE_Q8_0 = 8,
         GGML_TYPE_Q8_1 = 9,
-        // k-quantizations
         GGML_TYPE_Q2_K = 10,
         GGML_TYPE_Q3_K = 11,
         GGML_TYPE_Q4_K = 12,
         GGML_TYPE_Q5_K = 13,
         GGML_TYPE_Q6_K = 14,
         GGML_TYPE_Q8_K = 15,
+
         GGML_TYPE_IQ2_XXS = 16,
         GGML_TYPE_IQ2_XS  = 17,
         GGML_TYPE_IQ3_XXS = 18,
@@ -370,9 +370,9 @@ extern "C" {
         GGML_TYPE_IQ3_S = 21,
         GGML_TYPE_IQ2_S = 22,
         GGML_TYPE_IQ4_XS = 23,
-        GGML_TYPE_I8,
-        GGML_TYPE_I16,
-        GGML_TYPE_I32,
+        GGML_TYPE_I8  = 24,
+        GGML_TYPE_I16 = 25,
+        GGML_TYPE_I32 = 26,
         GGML_TYPE_COUNT,
     };
 
@@ -390,20 +390,20 @@ extern "C" {
 
     // model file types
     enum ggml_ftype {
         GGML_FTYPE_UNKNOWN = -1,
         GGML_FTYPE_ALL_F32 = 0,
         GGML_FTYPE_MOSTLY_F16 = 1,           // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_0 = 2,          // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_1 = 3,          // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
         GGML_FTYPE_MOSTLY_Q8_0 = 7,          // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_0 = 8,          // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_1 = 9,          // except 1d tensors
         GGML_FTYPE_MOSTLY_Q2_K = 10,         // except 1d tensors
         GGML_FTYPE_MOSTLY_Q3_K = 11,         // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_K = 12,         // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_K = 13,         // except 1d tensors
         GGML_FTYPE_MOSTLY_Q6_K = 14,         // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ2_XXS = 15,      // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ2_XS = 16,       // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ3_XXS = 17,      // except 1d tensors
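The point of pinning I8/I16/I32 to explicit values, and of the new NOTE at the top of the enum, is that these ids are written into GGUF files and must never be renumbered. An illustrative compile-time check of that contract follows; the static_asserts are an example, not part of ggml.

```cpp
// Illustrative only: the ggml/GGUF tensor type ids are part of the file format,
// so appending new entries is safe but renumbering existing ones is not.
// A consumer can make that contract visible with compile-time checks.
#include "ggml.h"

static_assert(GGML_TYPE_IQ4_XS == 23, "quant type ids must stay stable");
static_assert(GGML_TYPE_I8     == 24, "integer type ids must stay stable");
static_assert(GGML_TYPE_I16    == 25, "integer type ids must stay stable");
static_assert(GGML_TYPE_I32    == 26, "integer type ids must stay stable");

int main() {
    return 0;
}
```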
@@ -32,6 +32,7 @@ class Keys:
         FILE_TYPE            = "general.file_type"
 
     class LLM:
+        VOCAB_SIZE        = "{arch}.vocab_size"
         CONTEXT_LENGTH    = "{arch}.context_length"
         EMBEDDING_LENGTH  = "{arch}.embedding_length"
         BLOCK_COUNT       = "{arch}.block_count"
@@ -661,6 +662,9 @@ class GGMLQuantizationType(IntEnum):
     IQ3_S   = 21
     IQ2_S   = 22
     IQ4_XS  = 23
+    I8      = 24
+    I16     = 25
+    I32     = 26
 
 
 class GGUFEndian(IntEnum):
@@ -727,6 +731,9 @@ GGML_QUANT_SIZES = {
     GGMLQuantizationType.IQ3_S:   (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4),
     GGMLQuantizationType.IQ2_S:   (256, 2 + QK_K // 4 + QK_K // 16),
     GGMLQuantizationType.IQ4_XS:  (256, 2 + 2 + QK_K // 2 + QK_K // 64),
+    GGMLQuantizationType.I8:      (1, 1),
+    GGMLQuantizationType.I16:     (1, 2),
+    GGMLQuantizationType.I32:     (1, 4),
 }
 
 
@@ -746,6 +753,7 @@ KEY_GENERAL_SOURCE_HF_REPO = Keys.General.SOURCE_HF_REPO
 KEY_GENERAL_FILE_TYPE      = Keys.General.FILE_TYPE
 
 # LLM
+KEY_VOCAB_SIZE       = Keys.LLM.VOCAB_SIZE
 KEY_CONTEXT_LENGTH   = Keys.LLM.CONTEXT_LENGTH
 KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH
 KEY_BLOCK_COUNT      = Keys.LLM.BLOCK_COUNT
@@ -248,6 +248,15 @@ class GGUFReader:
         elif ggml_type == GGMLQuantizationType.F16:
             item_count = n_elems
             item_type = np.float16
+        elif ggml_type == GGMLQuantizationType.I8:
+            item_count = n_elems
+            item_type = np.int8
+        elif ggml_type == GGMLQuantizationType.I16:
+            item_count = n_elems
+            item_type = np.int16
+        elif ggml_type == GGMLQuantizationType.I32:
+            item_count = n_elems
+            item_type = np.int32
         else:
             item_count = n_bytes
             item_type = np.uint8
@@ -196,9 +196,6 @@ class GGUFWriter:
         if self.state is not WriterState.EMPTY:
             raise ValueError(f'Expected output file to be empty, got {self.state}')
 
-        if raw_dtype is None and tensor_dtype not in (np.float32, np.float16):
-            raise ValueError("Only F32 and F16 tensors are supported for now")
-
         encoded_name = name.encode("utf8")
         self.ti_data += self._pack("Q", len(encoded_name))
         self.ti_data += encoded_name
@@ -207,7 +204,18 @@ class GGUFWriter:
         for i in range(n_dims):
             self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i])
         if raw_dtype is None:
-            dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
+            if tensor_dtype == np.float32:
+                dtype = GGMLQuantizationType.F32
+            elif tensor_dtype == np.float16:
+                dtype = GGMLQuantizationType.F16
+            elif tensor_dtype == np.int8:
+                dtype = GGMLQuantizationType.I8
+            elif tensor_dtype == np.int16:
+                dtype = GGMLQuantizationType.I16
+            elif tensor_dtype == np.int32:
+                dtype = GGMLQuantizationType.I32
+            else:
+                raise ValueError("Only F32, F16, I8, I16, I32 tensors are supported for now")
         else:
             dtype = raw_dtype
         self.ti_data += self._pack("I", dtype)
@@ -313,6 +321,9 @@ class GGUFWriter:
         self.data_alignment = alignment
         self.add_uint32(Keys.General.ALIGNMENT, alignment)
 
+    def add_vocab_size(self, size: int) -> None:
+        self.add_uint32(Keys.LLM.VOCAB_SIZE.format(arch=self.arch), size)
+
     def add_context_length(self, length: int) -> None:
         self.add_uint32(Keys.LLM.CONTEXT_LENGTH.format(arch=self.arch), length)
 
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.7.0"
+version = "0.8.0"
 description = "Read and write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [
@@ -616,7 +616,7 @@ maxhordelen = 256
 modelbusy = threading.Lock()
 requestsinqueue = 0
 defaultport = 5001
-KcppVersion = "1.61.2"
+KcppVersion = "1.62"
 showdebug = True
 showsamplerwarning = True
 showmaxctxwarning = True
llama.cpp (126 changes)
@@ -282,6 +282,7 @@ enum llm_kv {
     LLM_KV_GENERAL_SOURCE_URL,
     LLM_KV_GENERAL_SOURCE_HF_REPO,
 
+    LLM_KV_VOCAB_SIZE,
     LLM_KV_CONTEXT_LENGTH,
     LLM_KV_EMBEDDING_LENGTH,
     LLM_KV_BLOCK_COUNT,
@@ -345,6 +346,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_GENERAL_SOURCE_URL,    "general.source.url"                    },
     { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
 
+    { LLM_KV_VOCAB_SIZE,             "%s.vocab_size"       },
     { LLM_KV_CONTEXT_LENGTH,         "%s.context_length"   },
     { LLM_KV_EMBEDDING_LENGTH,       "%s.embedding_length" },
     { LLM_KV_BLOCK_COUNT,            "%s.block_count"      },
@@ -3288,10 +3290,11 @@ static const char * llama_model_type_name(e_model type) {
 
 static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
     switch (type) {
-        case LLAMA_VOCAB_TYPE_SPM: return "SPM";
-        case LLAMA_VOCAB_TYPE_BPE: return "BPE";
-        case LLAMA_VOCAB_TYPE_WPM: return "WPM";
-        default: return "unknown";
+        case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
+        case LLAMA_VOCAB_TYPE_SPM:  return "SPM";
+        case LLAMA_VOCAB_TYPE_BPE:  return "BPE";
+        case LLAMA_VOCAB_TYPE_WPM:  return "WPM";
+        default:                    return "unknown";
     }
 }
 
@@ -3323,14 +3326,14 @@ static void llm_load_hparams(
     ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
 
     // get hparams kv
-    ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
-    ml.get_key  (LLM_KV_CONTEXT_LENGTH,       hparams.n_ctx_train);
-    ml.get_key  (LLM_KV_EMBEDDING_LENGTH,     hparams.n_embd);
-    ml.get_key  (LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff);
-    ml.get_key  (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
-    ml.get_key  (LLM_KV_BLOCK_COUNT,          hparams.n_layer);
-    ml.get_key  (LLM_KV_EXPERT_COUNT,         hparams.n_expert,      false);
-    ml.get_key  (LLM_KV_EXPERT_USED_COUNT,    hparams.n_expert_used, false);
+    ml.get_key(LLM_KV_VOCAB_SIZE,           hparams.n_vocab,       false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
+    ml.get_key(LLM_KV_CONTEXT_LENGTH,       hparams.n_ctx_train);
+    ml.get_key(LLM_KV_EMBEDDING_LENGTH,     hparams.n_embd);
+    ml.get_key(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff);
+    ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
+    ml.get_key(LLM_KV_BLOCK_COUNT,          hparams.n_layer);
+    ml.get_key(LLM_KV_EXPERT_COUNT,         hparams.n_expert,      false);
+    ml.get_key(LLM_KV_EXPERT_USED_COUNT,    hparams.n_expert_used, false);
 
     GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
     GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
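The changed n_vocab line relies on the optional get_key() call returning false, rather than throwing, when the %s.vocab_size key is absent, so the || short-circuits into the old behaviour of counting the tokenizer token list. A generic sketch of that short-circuit fallback idiom with stand-in lookup functions; none of these helpers are llama.cpp API.

```cpp
// Generic sketch of the fallback idiom used above: try an optional key first,
// and only fall back to the legacy lookup when it is missing.
#include <cstdint>
#include <cstdio>
#include <map>
#include <string>

// stand-in for an optional metadata lookup (returns false instead of throwing)
static bool get_optional_key(const std::map<std::string, uint32_t> & kv, const std::string & key, uint32_t & out) {
    auto it = kv.find(key);
    if (it == kv.end()) {
        return false;
    }
    out = it->second;
    return true;
}

// stand-in for the legacy path (e.g. counting the tokenizer token list)
static bool get_legacy_count(uint32_t & out) {
    out = 32000;
    return true;
}

int main() {
    std::map<std::string, uint32_t> kv; // e.g. {"llama.vocab_size", 32016} when present

    uint32_t n_vocab = 0;
    // same shape as: ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab);
    get_optional_key(kv, "llama.vocab_size", n_vocab) || get_legacy_count(n_vocab);

    printf("n_vocab = %u\n", n_vocab);
    return 0;
}
```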
@@ -3692,30 +3695,25 @@ static void llm_load_vocab(
 
     const auto kv = LLM_KV(model.arch);
 
-    const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
-    if (token_idx == -1) {
-        throw std::runtime_error("cannot find tokenizer vocab in model file\n");
-    }
-
-    const float * scores = nullptr;
-    const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
-    if (score_idx != -1) {
-        scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
-    }
-
-    const int * toktypes = nullptr;
-    const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
-    if (toktype_idx != -1) {
-        toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
-    }
-
     // determine vocab type
     {
         std::string tokenizer_name;
 
         ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
 
-        if (tokenizer_name == "llama") {
+        if (tokenizer_name == "no_vocab") {
+            vocab.type = LLAMA_VOCAB_TYPE_NONE;
+
+            // default special tokens
+            vocab.special_bos_id = -1;
+            vocab.special_eos_id = -1;
+            vocab.special_unk_id = -1;
+            vocab.special_sep_id = -1;
+            vocab.special_pad_id = -1;
+            vocab.linefeed_id    = -1;
+
+            return;
+        } else if (tokenizer_name == "llama") {
             vocab.type = LLAMA_VOCAB_TYPE_SPM;
 
             // default special tokens
@@ -3790,6 +3788,23 @@
         }
     }
 
+    const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
+    if (token_idx == -1) {
+        throw std::runtime_error("cannot find tokenizer vocab in model file\n");
+    }
+
+    const float * scores = nullptr;
+    const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
+    if (score_idx != -1) {
+        scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
+    }
+
+    const int * toktypes = nullptr;
+    const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
+    if (toktype_idx != -1) {
+        toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
+    }
+
     const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
 
     vocab.id_to_token.resize(n_vocab);
@@ -3997,7 +4012,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: n_ff             = %u\n",     __func__, hparams.n_ff);
     LLAMA_LOG_INFO("%s: n_expert         = %u\n",     __func__, hparams.n_expert);
     LLAMA_LOG_INFO("%s: n_expert_used    = %u\n",     __func__, hparams.n_expert_used);
-    LLAMA_LOG_INFO("%s: causal attm      = %d\n",     __func__, hparams.causal_attn);
+    LLAMA_LOG_INFO("%s: causal attn      = %d\n",     __func__, hparams.causal_attn);
     LLAMA_LOG_INFO("%s: pooling type     = %d\n",     __func__, hparams.pooling_type);
     LLAMA_LOG_INFO("%s: rope type        = %d\n",     __func__, hparams.rope_type);
     LLAMA_LOG_INFO("%s: rope scaling     = %s\n",     __func__, rope_scaling_type);
@@ -5095,7 +5110,8 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
 
     llm_load_print_meta(ml, model);
 
-    if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
+    if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
+        model.hparams.n_vocab != model.vocab.id_to_token.size()) {
         throw std::runtime_error("vocab size mismatch");
     }
 
@ -9108,8 +9124,8 @@ static int llama_decode_internal(
|
||||||
//llama_synchronize(&lctx);
|
//llama_synchronize(&lctx);
|
||||||
|
|
||||||
// decide if we need to defrag the kv cache
|
// decide if we need to defrag the kv cache
|
||||||
if (cparams.defrag_thold >= 0.0f) {
|
if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) {
|
||||||
const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used + n_tokens_all)/float(kv_self.n) : 0.0f;
|
const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f;
|
||||||
|
|
||||||
// queue defragmentation for next llama_kv_cache_update
|
// queue defragmentation for next llama_kv_cache_update
|
||||||
if (fragmentation > cparams.defrag_thold) {
|
if (fragmentation > cparams.defrag_thold) {
|
||||||
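
Example (commentary, not part of the diff): the updated heuristic above measures fragmentation as the share of tracked KV cells that are currently unused, evaluated only once the cache spans at least 128 cells and, after this change, only for causal-attention contexts. A standalone sketch with illustrative numbers:

#include <cstdint>
#include <cstdio>

// fragmentation = fraction of allocated-but-unused cells,
// computed only for caches with >= 128 cells
static float kv_fragmentation(uint32_t n_cells, uint32_t n_used) {
    return n_cells >= 128 ? 1.0f - float(n_used)/float(n_cells) : 0.0f;
}

int main() {
    // e.g. 1024 tracked cells of which 640 are used -> fragmentation 0.375
    std::printf("%.3f\n", kv_fragmentation(1024, 640));
    return 0;
}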
@@ -9141,6 +9157,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
     // number of cells moved
     uint32_t n_moves = 0;

+    // each move requires 6*n_layer tensors (see build_defrag)
+    //   - source view, destination view, copy operation
+    //   - x2 for keys and values
+    const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
+
     // determine which KV cells to move where
     //
     //  cell i moves to ids[i]
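
Example (commentary, not part of the diff): the new max_moves cap follows directly from the comment above; each planned move contributes 6*n_layer nodes to the defrag graph, so the number of moves per graph is bounded by the node limit. A standalone sketch, where the 8192 node limit and 32 layers are illustrative assumptions rather than values taken from this patch:

#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t max_nodes = 8192;   // stand-in for LLAMA_MAX_NODES (assumed)
    const uint32_t n_layer   = 32;     // e.g. a 7B-class model (assumed)
    // 6 tensors per move: src view, dst view, copy; x2 for K and V
    const uint32_t max_moves = max_nodes/(6*n_layer);
    std::printf("max_moves = %u\n", (unsigned) max_moves); // prints 42
    return 0;
}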
@@ -9167,15 +9188,6 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
             nh++;
         }

-        // each move requires 6*n_layer tensors (see build_defrag)
-        //   - source view, destination view, copy operation
-        //   - x2 for keys and values
-        //
-        if (6*(n_moves + nh)*n_layer >= LLAMA_MAX_NODES) {
-            // the graph is too big, we cannot move more cells
-            break;
-        }
-
         uint32_t nf = 0;
         uint32_t is = n_kv - 1;

@@ -9205,11 +9217,19 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
         // are we moving a continuous block of memory?
         bool cont = false;

+        // should we stop searching for the next move?
+        bool stop = false;
+
         // go back and move the nf cells to the hole
         for (; i1 < n_kv; ++i1) {
             auto & cell1 = kv_self.cells[i1];

             if (cell1.is_empty() || ids[i1] != n_kv) {
+                if (n_moves == max_moves) {
+                    stop = true;
+                    break;
+                }
+
                 cont = false;
                 continue;
             }
@@ -9236,6 +9256,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
             }
         }

+        if (stop || n_moves == max_moves) {
+            break;
+        }
+
         //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);

         i0 += nh - 1;
@@ -9425,26 +9449,32 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
 }

 static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
 }

 static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
 }

 static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
 }

 static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
 }

 static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
 }

 static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
+    GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
     GGML_ASSERT(llama_is_byte_token(vocab, id));
     const auto& token_data = vocab.id_to_token.at(id);
     switch (llama_vocab_get_type(vocab)) {
@@ -9466,6 +9496,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
 }

 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
+    GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
     static const char * hex = "0123456789ABCDEF";
     switch (llama_vocab_get_type(vocab)) {
         case LLAMA_VOCAB_TYPE_SPM: {
@@ -10527,6 +10558,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                 }
             }
         } break;
+        case LLAMA_VOCAB_TYPE_NONE:
+            GGML_ASSERT(false);
     }

     return output;
@@ -12261,7 +12294,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     return new_type;
 }

-static int32_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
+static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
     std::mutex mutex;
     int counter = 0;
     size_t new_size = 0;
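
Example (commentary, not part of the diff): widening the return type of llama_tensor_quantize_internal to size_t matters because the function reports the quantized data size in bytes, which can exceed what int32_t holds for large tensors. A minimal standalone sketch of the truncation the change avoids; the ~3 GiB figure is an illustrative assumption:

#include <cstdint>
#include <cstdio>

int main() {
    // hypothetical quantized tensor of ~3 GiB, larger than INT32_MAX bytes
    const size_t  nbytes = (size_t) 3 * 1024 * 1024 * 1024;
    const int32_t as_i32 = (int32_t) nbytes;  // overflows; typically comes out negative
    std::printf("size_t: %zu bytes, int32_t: %d\n", nbytes, as_i32);
    return 0;
}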
@@ -13437,7 +13470,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
 }

 int32_t llama_n_vocab(const struct llama_model * model) {
-    return model->vocab.id_to_token.size();
+    return model->hparams.n_vocab;
 }

 int32_t llama_n_ctx_train(const struct llama_model * model) {
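
Example (commentary, not part of the diff): with llama_n_vocab() now reading hparams.n_vocab rather than the token-table size, it keeps reporting the width of the model's output layer even when the vocabulary table is empty (the "no_vocab" case above). A small usage sketch of the typical consumer that relies on this:

#include "llama.h"
#include <vector>

// logits consumers size their buffers from llama_n_vocab(); after this
// change that stays correct for models loaded without a vocabulary
std::vector<float> alloc_logits_row(const struct llama_model * model) {
    return std::vector<float>(llama_n_vocab(model), 0.0f);
}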
@@ -14271,14 +14304,17 @@ float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id
 }

 const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
+    GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return model->vocab.id_to_token[token].text.c_str();
 }

 float llama_token_get_score(const struct llama_model * model, llama_token token) {
+    GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return model->vocab.id_to_token[token].score;
 }

 llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
+    GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return model->vocab.id_to_token[token].type;
 }

7 llama.h
@@ -59,9 +59,10 @@ extern "C" {
     typedef int32_t llama_seq_id;

     enum llama_vocab_type {
-        LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
-        LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
-        LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
+        LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
+        LLAMA_VOCAB_TYPE_SPM  = 1, // SentencePiece
+        LLAMA_VOCAB_TYPE_BPE  = 2, // Byte Pair Encoding
+        LLAMA_VOCAB_TYPE_WPM  = 3, // WordPiece
     };

     // note: these values should be synchronized with ggml_rope
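
Example (commentary, not part of the diff): inserting LLAMA_VOCAB_TYPE_NONE at value 0 shifts the numeric values of the existing enumerators, so anything that stored or compared the raw integers needs rebuilding against the new header. A hedged sketch of a caller branching on the updated enum, assuming the public llama_vocab_type() accessor; the helper name is illustrative:

#include "llama.h"

// illustrative helper: map the updated vocab-type enum to a label,
// handling the new NONE case explicitly
static const char * vocab_type_name(const struct llama_model * model) {
    switch (llama_vocab_type(model)) {
        case LLAMA_VOCAB_TYPE_NONE: return "none (model has no vocab)";
        case LLAMA_VOCAB_TYPE_SPM:  return "SentencePiece";
        case LLAMA_VOCAB_TYPE_BPE:  return "BPE";
        case LLAMA_VOCAB_TYPE_WPM:  return "WordPiece";
    }
    return "unknown";
}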