commit 93d3871056
Author: Concedo
Date:   2024-03-15 10:37:48 +08:00

    Merge branch 'upstream' into concedo_experimental

    # Conflicts:
    #	.github/workflows/build.yml
    #	CMakeLists.txt
    #	Makefile
    #	ggml-metal.m

22 changed files with 341 additions and 198 deletions

@@ -25,17 +25,14 @@ jobs:
     strategy:
       matrix:
         sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        build_type: [Debug, Release]
+        build_type: [Debug]
         include:
           - build_type: Release
             sanitizer: ""
-        exclude:
-          - build_type: Release
-            sanitizer: ADDRESS
-          - build_type: Release
-            sanitizer: THREAD
-          - build_type: Release
-            sanitizer: UNDEFINED
+          - build_type: Debug
+            sanitizer: THREAD
+            disabled_on_pr: true
+      fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
 
     container:
       image: ubuntu:latest
@@ -81,13 +78,14 @@ jobs:
       - name: Tests
         id: server_integration_tests
+        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
        run: |
          cd examples/server/tests
          PORT=8888 ./tests.sh
 
      - name: Slow tests
        id: server_integration_tests_slow
-        if: ${{ github.event.schedule != '' && matrix.build_type == 'Release' || github.event.inputs.slow_tests == 'true' }}
+        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
        run: |
          cd examples/server/tests
          PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow
@@ -124,13 +122,14 @@ jobs:
       - name: Tests
         id: server_integration_tests
+        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
        run: |
          cd examples/server/tests
          behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
 
      - name: Slow tests
        id: server_integration_tests_slow
-        if: ${{ github.event.schedule != '' || github.event.inputs.slow_tests == 'true' }}
+        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
        run: |
          cd examples/server/tests
          behave.exe --stop --no-skipped --no-capture --tags slow

.gitignore

@@ -13,6 +13,8 @@
 .vs/
 .vscode/
 
+ggml-metal-embed.metal
+
 lcov-report/
 gcovr-report/

@@ -1878,3 +1878,16 @@ void llama_embd_normalize(const float * inp, float * out, int n) {
     }
 }
 
+float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n){
+    double sum  = 0.0;
+    double sum1 = 0.0;
+    double sum2 = 0.0;
+
+    for (int i = 0; i < n; i++) {
+        sum  += embd1[i] * embd2[i];
+        sum1 += embd1[i] * embd1[i];
+        sum2 += embd2[i] * embd2[i];
+    }
+
+    return sum / (sqrt(sum1) * sqrt(sum2));
+}
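
The new helper is plain cosine similarity over two embedding vectors. As a quick cross-check of the formula (not part of the diff), a NumPy equivalent could look like this:

```python
import numpy as np

def embd_similarity_cos(a: np.ndarray, b: np.ndarray) -> float:
    # mirrors llama_embd_similarity_cos: dot(a, b) / (|a| * |b|)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

a = np.array([1.0, 0.0, 1.0])
b = np.array([1.0, 1.0, 0.0])
print(embd_similarity_cos(a, b))  # 0.5: dot = 1, both norms = sqrt(2)
```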

@@ -282,3 +282,4 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40
 
 void llama_embd_normalize(const float * inp, float * out, int n);
+float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);

@@ -332,6 +332,9 @@ class Params:
 #
 
 class BpeVocab:
+    tokenizer_model = "gpt2"
+    name = "bpe"
+
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
         self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
         if isinstance(self.bpe_tokenizer.get('model'), dict):
@@ -390,6 +393,9 @@ class BpeVocab:
 
 
 class SentencePieceVocab:
+    tokenizer_model = "llama"
+    name = "spm"
+
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
         self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
         added_tokens: dict[str, int]
@@ -453,6 +459,9 @@ class SentencePieceVocab:
 
 
 class HfVocab:
+    tokenizer_model = "llama"
+    name = "hfft"
+
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
         try:
             from transformers import AutoTokenizer
@@ -553,7 +562,15 @@ class HfVocab:
         return f"<HfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 
 
-Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab"
+class NoVocab:
+    tokenizer_model = "no_vocab"
+    name = "no_vocab"
+
+    def __repr__(self) -> str:
+        return "<NoVocab for a model without integrated vocabulary>"
+
+
+Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab | NoVocab"
 
 
 #
@@ -935,8 +952,10 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
     # Handle special case where the model's vocab size is not set
     if params.n_vocab == -1:
         raise ValueError(
-            f"The model's vocab size is set to -1 in params.json. Please update it manually. Maybe {vocab.vocab_size}?"
+            f"The model's vocab size is set to -1 in params.json. Please update it manually.{f' Maybe {vocab.vocab_size}?' if hasattr(vocab, 'vocab_size') else ''}"
         )
+    if isinstance(vocab, NoVocab):
+        return  # model has no vocab
 
     # Check for a vocab size mismatch
     if params.n_vocab == vocab.vocab_size:
@@ -977,6 +996,7 @@ class OutputFile:
         name = str(params.path_model.parent).split('/')[-1]
 
         self.gguf.add_name                (name)
+        self.gguf.add_vocab_size          (params.n_vocab)
         self.gguf.add_context_length      (params.n_ctx)
         self.gguf.add_embedding_length    (params.n_embd)
         self.gguf.add_block_count         (params.n_layer)
@@ -1013,21 +1033,9 @@ class OutputFile:
         if params.ftype is not None:
             self.gguf.add_file_type(params.ftype)
 
-    def handle_tokenizer_model(self, vocab: Vocab) -> str:
-        # Map the vocab types to the supported tokenizer models
-        tokenizer_model = {
-            SentencePieceVocab: "llama",
-            HfVocab: "llama",
-            BpeVocab: "gpt2",
-        }.get(type(vocab))
-
-        # Block if vocab type is not predefined
-        if tokenizer_model is None:
-            raise ValueError("Unknown vocab type: Not supported")
-
-        return tokenizer_model
-
     def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
+        assert not isinstance(vocab, NoVocab)
+
         tokens = []
         scores = []
         toktypes = []
@@ -1043,11 +1051,8 @@ class OutputFile:
         return tokens, scores, toktypes
 
     def add_meta_vocab(self, vocab: Vocab) -> None:
-        # Handle the tokenizer model
-        tokenizer_model = self.handle_tokenizer_model(vocab)
-
         # Ensure that tokenizer_model is added to the GGUF model
-        self.gguf.add_tokenizer_model(tokenizer_model)
+        self.gguf.add_tokenizer_model(vocab.tokenizer_model)
 
         # Extract model vocabulary for model conversion
         tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab)
@@ -1074,6 +1079,26 @@ class OutputFile:
     def write_tensor_info(self) -> None:
         self.gguf.write_ti_data_to_file()
 
+    def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency: int) -> None:
+        ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency=concurrency)
+        if ftype == GGMLFileType.MostlyQ8_0:
+            ndarrays = bounded_parallel_map(
+                OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
+                use_processpool_executor=True,
+            )
+        else:
+            ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
+
+        start = time.time()
+        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
+            elapsed = time.time() - start
+            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
+            padi = len(str(len(model)))
+            print(
+                f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
+            )
+            self.gguf.write_tensor_data(ndarray)
+
     def close(self) -> None:
         self.gguf.close()
@@ -1082,7 +1107,7 @@ class OutputFile:
         fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
         endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
     ) -> None:
-        check_vocab_size(params, vocab, pad_vocab = pad_vocab)
+        check_vocab_size(params, vocab, pad_vocab=pad_vocab)
 
         of = OutputFile(fname_out, endianess=endianess)
@@ -1120,8 +1145,11 @@ class OutputFile:
 
         # meta data
         of.add_meta_arch(params)
-        of.add_meta_vocab(vocab)
-        of.add_meta_special_vocab(svocab)
+        if isinstance(vocab, NoVocab):
+            of.gguf.add_tokenizer_model(vocab.tokenizer_model)
+        else:
+            of.add_meta_vocab(vocab)
+            of.add_meta_special_vocab(svocab)
 
         # tensor info
         for name, lazy_tensor in model.items():
@@ -1131,24 +1159,7 @@ class OutputFile:
         of.write_tensor_info()
 
         # tensor data
-        ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
-        if ftype == GGMLFileType.MostlyQ8_0:
-            ndarrays = bounded_parallel_map(
-                OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
-                use_processpool_executor=True,
-            )
-        else:
-            ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
-
-        start = time.time()
-        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
-            elapsed = time.time() - start
-            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
-            padi = len(str(len(model)))
-            print(
-                f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
-            )
-            of.gguf.write_tensor_data(ndarray)
+        of.write_tensor_data(ftype, model, concurrency)
 
         of.close()
@@ -1309,8 +1320,8 @@ class VocabFactory:
                 return vtype, path
         raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}")
 
-    def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
-        load_merges = vocabtype == "bpe"
+    def _create_special_vocab(self, vocab: Vocab, model_parent_path: Path) -> gguf.SpecialVocab:
+        load_merges = vocab.name == "bpe"
         n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
         return gguf.SpecialVocab(
             model_parent_path,
@@ -1319,30 +1330,34 @@ class VocabFactory:
             n_vocab=n_vocab,
         )
 
-    def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
+    def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab:
         vocab_type, path = self._select_file(vocab_types)
         print(f"Loading vocab file {path!r}, type {vocab_type!r}")
 
         added_tokens_path = path.parent / "added_tokens.json"
-        vocab: Vocab
         if vocab_type == "bpe":
-            vocab = BpeVocab(
+            return BpeVocab(
                 path, added_tokens_path if added_tokens_path.exists() else None
             )
-        elif vocab_type == "spm":
-            vocab = SentencePieceVocab(
+        if vocab_type == "spm":
+            return SentencePieceVocab(
                 path, added_tokens_path if added_tokens_path.exists() else None
             )
-        elif vocab_type == "hfft":
-            vocab = HfVocab(
+        if vocab_type == "hfft":
+            return HfVocab(
                 path.parent, added_tokens_path if added_tokens_path.exists() else None
             )
+        raise ValueError(vocab_type)
+
+    def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
+        vocab: Vocab
+        if len(vocab_types) == 1 and "no_vocab" in vocab_types:
+            vocab = NoVocab()
         else:
-            raise ValueError(vocab_type)
+            vocab = self._create_vocab_by_path(vocab_types)
+
         # FIXME: Respect --vocab-dir?
         special_vocab = self._create_special_vocab(
             vocab,
-            vocab_type,
             model_parent_path,
         )
         return vocab, special_vocab
@@ -1380,6 +1395,7 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
     parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
     parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
+    parser.add_argument("--no-vocab", action="store_true", help="store model without the vocab")
     parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
     parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
     parser.add_argument("--vocab-type", help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft")
@@ -1392,6 +1408,10 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
 
     args = parser.parse_args(args_in)
+    if args.no_vocab:
+        if args.vocab_only:
+            raise ValueError("no need to specify --vocab-only if using --no-vocab")
+        args.vocab_type = "no_vocab"
 
     if args.dump_single:
         model_plus = lazy_load_file(args.model)
@@ -1442,7 +1462,7 @@ def main(args_in: list[str] | None = None) -> None:
         print(f"Wrote {outfile}")
         return
 
-    if model_plus.vocab is not None and args.vocab_dir is None:
+    if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
         vocab = model_plus.vocab
 
     print(f"Vocab info: {vocab}")

@@ -113,13 +113,20 @@ int main(int argc, char ** argv) {
     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
     for (const auto & prompt : prompts) {
-        auto inp = ::llama_tokenize(ctx, prompt, true);
+        auto inp = ::llama_tokenize(ctx, prompt, true, false);
         if (inp.size() > n_batch) {
             inp.resize(n_batch);
         }
         inputs.push_back(inp);
     }
 
+    // add eos if not present
+    for (auto & inp : inputs) {
+        if (inp.empty() || inp.back() != llama_token_eos(model)) {
+            inp.push_back(llama_token_eos(model));
+        }
+    }
+
     // tokenization stats
     if (params.verbose_prompt) {
         for (int i = 0; i < (int) inputs.size(); i++) {
@@ -168,15 +175,26 @@ int main(int argc, char ** argv) {
         float * out = emb + p * n_embd;
         batch_decode(ctx, batch, out, s, n_embd);
     }
 
-    // print first 3 embeddings
-    for (int j = 0; j < std::min(3, n_prompts); j++) {
-        fprintf(stderr, "embedding %d: ", j);
-        for (int i = 0; i < n_embd; i++) {
-            fprintf(stderr, "%f ", emb[j * n_embd + i]);
-        }
-        fprintf(stderr, "\n\n");
-    }
-    fprintf(stderr, "\n");
+    // print the first part of the embeddings
+    fprintf(stdout, "\n");
+    for (int j = 0; j < n_prompts; j++) {
+        fprintf(stdout, "embedding %d: ", j);
+        for (int i = 0; i < std::min(16, n_embd); i++) {
+            fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+        }
+        fprintf(stdout, "\n");
+    }
+
+    // print cosine similarity matrix
+    fprintf(stdout, "\n");
+    printf("cosine similarity matrix:\n\n");
+    for (int i = 0; i < n_prompts; i++) {
+        for (int j = 0; j < n_prompts; j++) {
+            float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+            fprintf(stdout, "%6.2f ", sim);
+        }
+        fprintf(stdout, "\n");
+    }
 
     // clean up
     llama_print_timings(ctx);

@@ -211,6 +211,7 @@ static bool gguf_ex_read_1(const std::string & fname) {
             for (int j = 0; j < ggml_nelements(cur); ++j) {
                 if (data[j] != 100 + i) {
                     fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]);
+                    gguf_free(ctx);
                     return false;
                 }
             }

@@ -6,22 +6,6 @@
 
 // #define GRIT_DEBUG
 
-static float dot_product(const std::vector<float> & v1, const std::vector<float> & v2) {
-    float dot = 0.0f;
-    for (uint64_t i = 0; i < v1.size(); ++i) {
-        dot += v1[i] * v2[i];
-    }
-    return dot;
-}
-
-static float norm(const std::vector<float> & v) {
-    return std::sqrt(dot_product(v, v));
-}
-
-static float cosine_similarity(const std::vector<float> & v1, const std::vector<float> & v2) {
-    return dot_product(v1, v2) / (norm(v1) * norm(v2));
-}
-
 static std::vector<std::vector<float>> encode(llama_context * ctx, const std::vector<std::string> & sentences, const std::string & instruction) {
     std::vector<std::vector<float>> result;
@@ -203,10 +187,12 @@ int main(int argc, char * argv[]) {
     const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
     const std::vector<std::vector<float>> q_rep = encode(ctx, queries, gritlm_instruction(instruction));
 
-    const float cosine_sim_q0_d0 = cosine_similarity(q_rep[0], d_rep[0]);
-    const float cosine_sim_q0_d1 = cosine_similarity(q_rep[0], d_rep[1]);
-    const float cosine_sim_q1_d0 = cosine_similarity(q_rep[1], d_rep[0]);
-    const float cosine_sim_q1_d1 = cosine_similarity(q_rep[1], d_rep[1]);
+    const int n_embd = llama_n_embd(mdl);
+
+    const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
+    const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
+    const float cosine_sim_q1_d0 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd);
+    const float cosine_sim_q1_d1 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd);
 
     std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0);
     std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1);

@@ -104,6 +104,7 @@ static std::string get_cpu_info() {
                 }
             }
         }
+        fclose(f);
     }
 #endif
     // TODO: other platforms

@@ -63,12 +63,20 @@ Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` directory
 ```console
 git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
 ```
 
-2) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
+2) Install the required Python packages:
+
+```sh
+pip install -r examples/llava/requirements.txt
+```
+
+3) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models:
 ```console
 python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/
 ```
 - you will find a llava.projector and a llava.clip file in your model directory
 
-3) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
+4) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
 ```console
 mkdir vit
 cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin
@@ -76,18 +84,18 @@ cp ../llava-v1.6-vicuna-7b/llava.projector vit/
 curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json
 ```
 
-4) Create the visual gguf model:
+5) Create the visual gguf model:
 ```console
 python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
 ```
 - This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
 
-5) Then convert the model to gguf format:
+6) Then convert the model to gguf format:
 ```console
 python ./convert.py ../llava-v1.6-vicuna-7b/ --skip-unknown
 ```
 
-6) And finally we can run the llava-cli using the 1.6 model version:
+7) And finally we can run the llava-cli using the 1.6 model version:
 ```console
 ./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
 ```

@@ -995,6 +995,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     if (!new_clip->ctx_data) {
         fprintf(stderr, "%s: ggml_init() failed\n", __func__);
         clip_free(new_clip);
+        gguf_free(ctx);
         return nullptr;
     }
 
@@ -1002,6 +1003,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     if (!fin) {
         printf("cannot open model file for loading tensors\n");
         clip_free(new_clip);
+        gguf_free(ctx);
         return nullptr;
     }
 
@@ -1023,6 +1025,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         if (!fin) {
             printf("%s: failed to seek for tensor %s\n", __func__, name);
             clip_free(new_clip);
+            gguf_free(ctx);
             return nullptr;
         }
         int num_bytes = ggml_nbytes(cur);
@@ -1908,6 +1911,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
             break;
         default:
             printf("Please use an input file in f32 or f16\n");
+            gguf_free(ctx_out);
             return false;
     }
 

@@ -119,6 +119,10 @@ def step_server_metrics(context):
 def step_start_server(context):
     start_server_background(context)
     attempts = 0
+    max_attempts = 20
+    if 'GITHUB_ACTIONS' in os.environ:
+        max_attempts *= 2
+
     while True:
         with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
             result = sock.connect_ex((context.server_fqdn, context.server_port))
@@ -126,7 +130,7 @@ def step_start_server(context):
             print("\x1b[33;46mserver started!\x1b[0m")
             return
         attempts += 1
-        if attempts > 20:
+        if attempts > max_attempts:
             assert False, "server not started"
         print(f"waiting for server to start, connect error code = {result}...")
         time.sleep(0.1)
@@ -943,6 +947,9 @@ async def wait_for_health_status(context,
     print(f"Starting checking for health for expected_health_status={expected_health_status}\n")
     interval = 0.5
     counter = 0
+    if 'GITHUB_ACTIONS' in os.environ:
+        timeout *= 2
+
     async with aiohttp.ClientSession() as session:
         while True:
             async with await session.get(f'{base_url}/health', params=params) as health_response:

@@ -711,6 +711,7 @@ static bool load_checkpoint_file(const char * filename, struct my_llama_model *
     load_checkpoint_gguf(fctx, f_ggml_ctx, model, train);
 
+    gguf_free(fctx);
     return true;
 }

@@ -280,6 +280,11 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
     id<MTLLibrary> metal_library;
 
     // load library
+    //
+    // - first check if the library is embedded
+    // - then check if the library is in the bundle
+    // - if not found, load the source and compile it
+    // - if that fails, return NULL
     {
         NSBundle * bundle = nil;
 #ifdef SWIFT_PACKAGE
@ -287,12 +292,21 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
#else #else
bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
#endif #endif
NSError * error = nil; NSError * error = nil;
NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
if (libPath != nil) { #if GGML_METAL_EMBED_LIBRARY
const bool try_metallib = false;
#else
const bool try_metallib = true;
#endif
NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"];
if (try_metallib && path_lib != nil) {
// pre-compiled library found // pre-compiled library found
NSURL * libURL = [NSURL fileURLWithPath:libPath]; NSURL * libURL = [NSURL fileURLWithPath:path_lib];
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]); GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]);
metal_library = [ctx->device newLibraryWithURL:libURL error:&error]; metal_library = [ctx->device newLibraryWithURL:libURL error:&error];
if (error) { if (error) {
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
@@ -305,31 +319,34 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
             extern const char ggml_metallib_start[];
             extern const char ggml_metallib_end[];
 
             NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
 #else
             GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
 
-            NSString * sourcePath;
-            NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
-
-            GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, ggmlMetalPathResources ? [ggmlMetalPathResources UTF8String] : "nil");
-
-            if (ggmlMetalPathResources) {
-                sourcePath = [ggmlMetalPathResources stringByAppendingPathComponent:@"ggml-metal-merged.metal"];
+            NSString * path_source;
+            NSString * path_resource = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
+
+            GGML_METAL_LOG_INFO("%s: GGML_METAL_PATH_RESOURCES = %s\n", __func__, path_resource ? [path_resource UTF8String] : "nil");
+
+            if (path_resource) {
+                path_source = [path_resource stringByAppendingPathComponent:@"ggml-metal-merged.metal"];
             } else {
-                sourcePath = [bundle pathForResource:@"ggml-metal-merged" ofType:@"metal"];
+                path_source = [bundle pathForResource:@"ggml-metal-merged" ofType:@"metal"];
             }
-            if (sourcePath == nil) {
+
+            if (path_source == nil) {
                 GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal-merged.metal, falling back to trying cwd\n", __func__);
-                sourcePath = @"ggml-metal-merged.metal";
+                path_source = @"ggml-metal.metal";
             }
-            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]);
 
-            NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error];
+            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]);
+
+            NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error];
             if (error) {
                 GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
                 return NULL;
             }
-#endif
+#endif // GGML_METAL_EMBED_LIBRARY
 
         @autoreleasepool {
             // dictionary of preprocessor macros

ggml.h

@@ -344,24 +344,24 @@ extern "C" {
     struct ggml_object;
     struct ggml_context;
 
+    // NOTE: always add types at the end of the enum to keep backward compatibility
     enum ggml_type {
         GGML_TYPE_F32     = 0,
         GGML_TYPE_F16     = 1,
         GGML_TYPE_Q4_0    = 2,
         GGML_TYPE_Q4_1    = 3,
         // GGML_TYPE_Q4_2 = 4, support has been removed
-        // GGML_TYPE_Q4_3 (5) support has been removed
+        // GGML_TYPE_Q4_3 = 5, support has been removed
         GGML_TYPE_Q5_0    = 6,
         GGML_TYPE_Q5_1    = 7,
         GGML_TYPE_Q8_0    = 8,
         GGML_TYPE_Q8_1    = 9,
-        // k-quantizations
         GGML_TYPE_Q2_K    = 10,
         GGML_TYPE_Q3_K    = 11,
         GGML_TYPE_Q4_K    = 12,
         GGML_TYPE_Q5_K    = 13,
         GGML_TYPE_Q6_K    = 14,
         GGML_TYPE_Q8_K    = 15,
         GGML_TYPE_IQ2_XXS = 16,
         GGML_TYPE_IQ2_XS  = 17,
         GGML_TYPE_IQ3_XXS = 18,
@@ -370,9 +370,9 @@ extern "C" {
         GGML_TYPE_IQ3_S   = 21,
         GGML_TYPE_IQ2_S   = 22,
         GGML_TYPE_IQ4_XS  = 23,
-        GGML_TYPE_I8,
-        GGML_TYPE_I16,
-        GGML_TYPE_I32,
+        GGML_TYPE_I8      = 24,
+        GGML_TYPE_I16     = 25,
+        GGML_TYPE_I32     = 26,
         GGML_TYPE_COUNT,
     };
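
The explicit values matter because ggml_type ids are serialized into GGUF files, and the Python side mirrors them (see the gguf-py constants change later in this diff). A quick sanity check, assuming the gguf package from this tree is installed:

```python
from gguf.constants import GGMLQuantizationType

# the new integer types must keep the same ids as the C enum above
assert GGMLQuantizationType.I8  == 24
assert GGMLQuantizationType.I16 == 25
assert GGMLQuantizationType.I32 == 26
```
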
@@ -390,20 +390,20 @@ extern "C" {
     // model file types
     enum ggml_ftype {
         GGML_FTYPE_UNKNOWN = -1,
         GGML_FTYPE_ALL_F32 = 0,
         GGML_FTYPE_MOSTLY_F16 = 1,           // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_0 = 2,          // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_1 = 3,          // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
         GGML_FTYPE_MOSTLY_Q8_0 = 7,          // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_0 = 8,          // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_1 = 9,          // except 1d tensors
         GGML_FTYPE_MOSTLY_Q2_K = 10,         // except 1d tensors
         GGML_FTYPE_MOSTLY_Q3_K = 11,         // except 1d tensors
         GGML_FTYPE_MOSTLY_Q4_K = 12,         // except 1d tensors
         GGML_FTYPE_MOSTLY_Q5_K = 13,         // except 1d tensors
         GGML_FTYPE_MOSTLY_Q6_K = 14,         // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ2_XXS = 15,      // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ2_XS = 16,       // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ3_XXS = 17,      // except 1d tensors

@@ -32,6 +32,7 @@ class Keys:
         FILE_TYPE    = "general.file_type"
 
     class LLM:
+        VOCAB_SIZE       = "{arch}.vocab_size"
         CONTEXT_LENGTH   = "{arch}.context_length"
         EMBEDDING_LENGTH = "{arch}.embedding_length"
         BLOCK_COUNT      = "{arch}.block_count"
@@ -661,6 +662,9 @@ class GGMLQuantizationType(IntEnum):
    IQ3_S   = 21
    IQ2_S   = 22
    IQ4_XS  = 23
+    I8      = 24
+    I16     = 25
+    I32     = 26
 
 
 class GGUFEndian(IntEnum):
@@ -727,6 +731,9 @@ GGML_QUANT_SIZES = {
     GGMLQuantizationType.IQ3_S:   (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4),
     GGMLQuantizationType.IQ2_S:   (256, 2 + QK_K // 4 + QK_K // 16),
     GGMLQuantizationType.IQ4_XS:  (256, 2 + 2 + QK_K // 2 + QK_K // 64),
+    GGMLQuantizationType.I8:      (1, 1),
+    GGMLQuantizationType.I16:     (1, 2),
+    GGMLQuantizationType.I32:     (1, 4),
 }
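
Each entry is a (block_size, type_size) pair, so a tensor's byte size follows directly; for the new integer types the block size is a single element. A small illustration (not part of the diff, assuming the gguf package from this tree):

```python
from gguf.constants import GGML_QUANT_SIZES, GGMLQuantizationType

def tensor_nbytes(n_elems: int, qtype: GGMLQuantizationType) -> int:
    # bytes = number of blocks * bytes per block
    block_size, type_size = GGML_QUANT_SIZES[qtype]
    return n_elems // block_size * type_size

print(tensor_nbytes(1024, GGMLQuantizationType.I32))  # 4096: 1024 elements * 4 bytes
```
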
@@ -746,6 +753,7 @@ KEY_GENERAL_SOURCE_HF_REPO = Keys.General.SOURCE_HF_REPO
 KEY_GENERAL_FILE_TYPE      = Keys.General.FILE_TYPE
 
 # LLM
+KEY_VOCAB_SIZE       = Keys.LLM.VOCAB_SIZE
 KEY_CONTEXT_LENGTH   = Keys.LLM.CONTEXT_LENGTH
 KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH
 KEY_BLOCK_COUNT      = Keys.LLM.BLOCK_COUNT

@@ -248,6 +248,15 @@ class GGUFReader:
         elif ggml_type == GGMLQuantizationType.F16:
             item_count = n_elems
             item_type = np.float16
+        elif ggml_type == GGMLQuantizationType.I8:
+            item_count = n_elems
+            item_type = np.int8
+        elif ggml_type == GGMLQuantizationType.I16:
+            item_count = n_elems
+            item_type = np.int16
+        elif ggml_type == GGMLQuantizationType.I32:
+            item_count = n_elems
+            item_type = np.int32
         else:
             item_count = n_bytes
             item_type = np.uint8
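
With this mapping in place, integer tensors come back as typed NumPy views instead of raw bytes. A minimal read-back sketch (hypothetical file name):

```python
from gguf import GGUFReader

reader = GGUFReader("model.gguf")  # hypothetical path
for tensor in reader.tensors:
    # an I32 tensor now reports dtype int32 rather than falling through to uint8
    print(tensor.name, tensor.tensor_type, tensor.data.dtype, tensor.data.shape)
```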

@@ -196,9 +196,6 @@ class GGUFWriter:
         if self.state is not WriterState.EMPTY:
             raise ValueError(f'Expected output file to be empty, got {self.state}')
 
-        if raw_dtype is None and tensor_dtype not in (np.float32, np.float16):
-            raise ValueError("Only F32 and F16 tensors are supported for now")
-
         encoded_name = name.encode("utf8")
         self.ti_data += self._pack("Q", len(encoded_name))
         self.ti_data += encoded_name
@@ -207,7 +204,18 @@ class GGUFWriter:
         for i in range(n_dims):
             self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i])
         if raw_dtype is None:
-            dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
+            if tensor_dtype == np.float32:
+                dtype = GGMLQuantizationType.F32
+            elif tensor_dtype == np.float16:
+                dtype = GGMLQuantizationType.F16
+            elif tensor_dtype == np.int8:
+                dtype = GGMLQuantizationType.I8
+            elif tensor_dtype == np.int16:
+                dtype = GGMLQuantizationType.I16
+            elif tensor_dtype == np.int32:
+                dtype = GGMLQuantizationType.I32
+            else:
+                raise ValueError("Only F32, F16, I8, I16, I32 tensors are supported for now")
         else:
             dtype = raw_dtype
         self.ti_data += self._pack("I", dtype)
@@ -313,6 +321,9 @@ class GGUFWriter:
         self.data_alignment = alignment
         self.add_uint32(Keys.General.ALIGNMENT, alignment)
 
+    def add_vocab_size(self, size: int) -> None:
+        self.add_uint32(Keys.LLM.VOCAB_SIZE.format(arch=self.arch), size)
+
     def add_context_length(self, length: int) -> None:
         self.add_uint32(Keys.LLM.CONTEXT_LENGTH.format(arch=self.arch), length)

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.7.0"
+version = "0.8.0"
 description = "Read and write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [

@@ -616,7 +616,7 @@ maxhordelen = 256
 modelbusy = threading.Lock()
 requestsinqueue = 0
 defaultport = 5001
-KcppVersion = "1.61.2"
+KcppVersion = "1.62"
 showdebug = True
 showsamplerwarning = True
 showmaxctxwarning = True

llama.cpp

@@ -282,6 +282,7 @@ enum llm_kv {
     LLM_KV_GENERAL_SOURCE_URL,
     LLM_KV_GENERAL_SOURCE_HF_REPO,
 
+    LLM_KV_VOCAB_SIZE,
     LLM_KV_CONTEXT_LENGTH,
     LLM_KV_EMBEDDING_LENGTH,
     LLM_KV_BLOCK_COUNT,
@@ -345,6 +346,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_GENERAL_SOURCE_URL,     "general.source.url"                    },
     { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
 
+    { LLM_KV_VOCAB_SIZE,       "%s.vocab_size"       },
     { LLM_KV_CONTEXT_LENGTH,   "%s.context_length"   },
     { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
     { LLM_KV_BLOCK_COUNT,      "%s.block_count"      },
@@ -3288,10 +3290,11 @@ static const char * llama_model_type_name(e_model type) {
 
 static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
     switch (type) {
-        case LLAMA_VOCAB_TYPE_SPM: return "SPM";
-        case LLAMA_VOCAB_TYPE_BPE: return "BPE";
-        case LLAMA_VOCAB_TYPE_WPM: return "WPM";
-        default:                   return "unknown";
+        case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
+        case LLAMA_VOCAB_TYPE_SPM:  return "SPM";
+        case LLAMA_VOCAB_TYPE_BPE:  return "BPE";
+        case LLAMA_VOCAB_TYPE_WPM:  return "WPM";
+        default:                    return "unknown";
     }
 }
@@ -3323,14 +3326,14 @@ static void llm_load_hparams(
     ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
 
     // get hparams kv
-    ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
-    ml.get_key  (LLM_KV_CONTEXT_LENGTH,       hparams.n_ctx_train);
-    ml.get_key  (LLM_KV_EMBEDDING_LENGTH,     hparams.n_embd);
-    ml.get_key  (LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff);
-    ml.get_key  (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
-    ml.get_key  (LLM_KV_BLOCK_COUNT,          hparams.n_layer);
-    ml.get_key  (LLM_KV_EXPERT_COUNT,         hparams.n_expert,      false);
-    ml.get_key  (LLM_KV_EXPERT_USED_COUNT,    hparams.n_expert_used, false);
+    ml.get_key(LLM_KV_VOCAB_SIZE,           hparams.n_vocab,       false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
+    ml.get_key(LLM_KV_CONTEXT_LENGTH,       hparams.n_ctx_train);
+    ml.get_key(LLM_KV_EMBEDDING_LENGTH,     hparams.n_embd);
+    ml.get_key(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff);
+    ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
+    ml.get_key(LLM_KV_BLOCK_COUNT,          hparams.n_layer);
+    ml.get_key(LLM_KV_EXPERT_COUNT,         hparams.n_expert,      false);
+    ml.get_key(LLM_KV_EXPERT_USED_COUNT,    hparams.n_expert_used, false);
 
     GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
     GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
@@ -3692,30 +3695,25 @@ static void llm_load_vocab(
     const auto kv = LLM_KV(model.arch);
 
-    const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
-    if (token_idx == -1) {
-        throw std::runtime_error("cannot find tokenizer vocab in model file\n");
-    }
-
-    const float * scores = nullptr;
-    const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
-    if (score_idx != -1) {
-        scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
-    }
-
-    const int * toktypes = nullptr;
-    const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
-    if (toktype_idx != -1) {
-        toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
-    }
-
     // determine vocab type
     {
         std::string tokenizer_name;
 
         ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
 
-        if (tokenizer_name == "llama") {
+        if (tokenizer_name == "no_vocab") {
+            vocab.type = LLAMA_VOCAB_TYPE_NONE;
+
+            // default special tokens
+            vocab.special_bos_id = -1;
+            vocab.special_eos_id = -1;
+            vocab.special_unk_id = -1;
+            vocab.special_sep_id = -1;
+            vocab.special_pad_id = -1;
+            vocab.linefeed_id    = -1;
+
+            return;
+        } else if (tokenizer_name == "llama") {
             vocab.type = LLAMA_VOCAB_TYPE_SPM;
 
             // default special tokens
@@ -3790,6 +3788,23 @@ static void llm_load_vocab(
         }
     }
 
+    const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
+    if (token_idx == -1) {
+        throw std::runtime_error("cannot find tokenizer vocab in model file\n");
+    }
+
+    const float * scores = nullptr;
+    const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
+    if (score_idx != -1) {
+        scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
+    }
+
+    const int * toktypes = nullptr;
+    const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
+    if (toktype_idx != -1) {
+        toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
+    }
+
     const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
 
     vocab.id_to_token.resize(n_vocab);
@@ -3997,7 +4012,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: n_ff             = %u\n",     __func__, hparams.n_ff);
     LLAMA_LOG_INFO("%s: n_expert         = %u\n",     __func__, hparams.n_expert);
     LLAMA_LOG_INFO("%s: n_expert_used    = %u\n",     __func__, hparams.n_expert_used);
-    LLAMA_LOG_INFO("%s: causal attm      = %d\n",     __func__, hparams.causal_attn);
+    LLAMA_LOG_INFO("%s: causal attn      = %d\n",     __func__, hparams.causal_attn);
     LLAMA_LOG_INFO("%s: pooling type     = %d\n",     __func__, hparams.pooling_type);
     LLAMA_LOG_INFO("%s: rope type        = %d\n",     __func__, hparams.rope_type);
     LLAMA_LOG_INFO("%s: rope scaling     = %s\n",     __func__, rope_scaling_type);
@@ -5095,7 +5110,8 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
     llm_load_print_meta(ml, model);
 
-    if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
+    if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
+        model.hparams.n_vocab != model.vocab.id_to_token.size()) {
         throw std::runtime_error("vocab size mismatch");
     }
 
@@ -9108,8 +9124,8 @@ static int llama_decode_internal(
     //llama_synchronize(&lctx);
 
     // decide if we need to defrag the kv cache
-    if (cparams.defrag_thold >= 0.0f) {
-        const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used + n_tokens_all)/float(kv_self.n) : 0.0f;
+    if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) {
+        const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f;
 
         // queue defragmentation for next llama_kv_cache_update
         if (fragmentation > cparams.defrag_thold) {
@@ -9141,6 +9157,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
     // number of cells moved
     uint32_t n_moves = 0;
 
+    // each move requires 6*n_layer tensors (see build_defrag)
+    //   - source view, destination view, copy operation
+    //   - x2 for keys and values
+    const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
+
     // determine which KV cells to move where
     //
     //   cell i moves to ids[i]
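
Hoisting the cap out of the loop makes the move budget explicit. For a feel of the numbers (illustrative only; 8192 is assumed to be the LLAMA_MAX_NODES value at the time):

```python
LLAMA_MAX_NODES = 8192  # assumed graph-node budget
n_layer = 32            # e.g. a 7B model

max_moves = LLAMA_MAX_NODES // (6 * n_layer)
print(max_moves)  # 42: at most 42 defrag moves fit in one graph
```
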
@@ -9167,15 +9188,6 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
             nh++;
         }
 
-        // each move requires 6*n_layer tensors (see build_defrag)
-        //   - source view, destination view, copy operation
-        //   - x2 for keys and values
-        //
-        if (6*(n_moves + nh)*n_layer >= LLAMA_MAX_NODES) {
-            // the graph is too big, we cannot move more cells
-            break;
-        }
-
         uint32_t nf = 0;
         uint32_t is = n_kv - 1;
@@ -9205,11 +9217,19 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
         // are we moving a continuous block of memory?
         bool cont = false;
 
+        // should we stop searching for the next move?
+        bool stop = false;
+
         // go back and move the nf cells to the hole
         for (; i1 < n_kv; ++i1) {
             auto & cell1 = kv_self.cells[i1];
 
             if (cell1.is_empty() || ids[i1] != n_kv) {
+                if (n_moves == max_moves) {
+                    stop = true;
+                    break;
+                }
+
                 cont = false;
                 continue;
             }
@@ -9236,6 +9256,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
             }
         }
 
+        if (stop || n_moves == max_moves) {
+            break;
+        }
+
         //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
 
         i0 += nh - 1;
@@ -9425,26 +9449,32 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
 }
 
 static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
 }
 
 static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
 }
 
 static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
 }
 
 static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
 }
 
 static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
+    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
 }
 
 static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
+    GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
     GGML_ASSERT(llama_is_byte_token(vocab, id));
     const auto& token_data = vocab.id_to_token.at(id);
     switch (llama_vocab_get_type(vocab)) {
@@ -9466,6 +9496,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
 }
 
 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
+    GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
     static const char * hex = "0123456789ABCDEF";
     switch (llama_vocab_get_type(vocab)) {
         case LLAMA_VOCAB_TYPE_SPM: {
@@ -10527,6 +10558,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                 }
             }
         } break;
+        case LLAMA_VOCAB_TYPE_NONE:
+            GGML_ASSERT(false);
     }
 
     return output;
@@ -12261,7 +12294,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     return new_type;
 }
 
-static int32_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
+static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
     std::mutex mutex;
     int counter = 0;
     size_t new_size = 0;
@@ -13437,7 +13470,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
 }
 
 int32_t llama_n_vocab(const struct llama_model * model) {
-    return model->vocab.id_to_token.size();
+    return model->hparams.n_vocab;
 }
 
 int32_t llama_n_ctx_train(const struct llama_model * model) {
@@ -14271,14 +14304,17 @@ float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id
 }
 
 const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
+    GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return model->vocab.id_to_token[token].text.c_str();
 }
 
 float llama_token_get_score(const struct llama_model * model, llama_token token) {
+    GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return model->vocab.id_to_token[token].score;
 }
 
 llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
+    GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
     return model->vocab.id_to_token[token].type;
 }

@@ -59,9 +59,10 @@ extern "C" {
     typedef int32_t llama_seq_id;
 
     enum llama_vocab_type {
-        LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
-        LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
-        LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
+        LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
+        LLAMA_VOCAB_TYPE_SPM  = 1, // SentencePiece
+        LLAMA_VOCAB_TYPE_BPE  = 2, // Byte Pair Encoding
+        LLAMA_VOCAB_TYPE_WPM  = 3, // WordPiece
     };
 
     // note: these values should be synchronized with ggml_rope