diff --git a/common/common.cpp b/common/common.cpp index 06f96a23b..d04c37d7f 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -168,6 +168,24 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (params.n_threads_batch <= 0) { params.n_threads_batch = std::thread::hardware_concurrency(); } + } else if (arg == "-td" || arg == "--threads-draft") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_threads_draft = std::stoi(argv[i]); + if (params.n_threads_draft <= 0) { + params.n_threads_draft = std::thread::hardware_concurrency(); + } + } else if (arg == "-tbd" || arg == "--threads-batch-draft") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_threads_batch_draft = std::stoi(argv[i]); + if (params.n_threads_batch_draft <= 0) { + params.n_threads_batch_draft = std::thread::hardware_concurrency(); + } } else if (arg == "-p" || arg == "--prompt") { if (++i >= argc) { invalid_param = true; @@ -664,6 +682,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.hellaswag_tasks = std::stoi(argv[i]); + } else if (arg == "--winogrande") { + params.winogrande = true; + } else if (arg == "--winogrande-tasks") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.winogrande_tasks = std::stoi(argv[i]); } else if (arg == "--ignore-eos") { params.ignore_eos = true; } else if (arg == "--no-penalize-nl") { @@ -846,6 +872,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads); printf(" -tb N, --threads-batch N\n"); printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n"); + printf(" -td N, --threads-draft N\n"); + printf(" number of threads to use during generation (default: same as --threads)\n"); + printf(" -tbd N, --threads-batch-draft N\n"); + printf(" number of threads to use during batch and prompt processing (default: same as --threads-draft)\n"); printf(" -p PROMPT, --prompt PROMPT\n"); printf(" prompt to start generation with (default: empty)\n"); printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); @@ -905,6 +935,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n"); printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n"); printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks); + printf(" --winogrande compute Winogrande score over random tasks from datafile supplied with -f\n"); + printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks); printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft); printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); diff --git a/common/common.h b/common/common.h index 9b92e4297..013818f0e 100644 --- a/common/common.h +++ b/common/common.h @@ -40,7 +40,9 @@ struct gpt_params { uint32_t seed = -1; // RNG seed int32_t n_threads = get_num_physical_cores(); + int32_t n_threads_draft = -1; int32_t n_threads_batch = -1; // number of
threads to use for batch processing (-1 = use n_threads) + int32_t n_threads_batch_draft = -1; int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 512; // context size int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) @@ -115,6 +117,9 @@ struct gpt_params { bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score + bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt + size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed + bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS bool random_prompt = false; // do not randomize prompt if none provided bool use_color = false; // use color to distinguish generations and inputs diff --git a/common/sampling.cpp b/common/sampling.cpp index 8e45909f1..dd1ffeb1b 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -190,6 +190,11 @@ static llama_token llama_sampling_sample_impl( logits[it->first] += it->second; } + if (ctx_cfg) { + float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx); + llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale); + } + cur.clear(); for (llama_token token_id = 0; token_id < n_vocab; token_id++) { @@ -198,10 +203,6 @@ static llama_token llama_sampling_sample_impl( llama_token_data_array cur_p = { cur.data(), cur.size(), false }; - if (ctx_cfg) { - llama_sample_classifier_free_guidance(ctx_main, &cur_p, ctx_cfg, params.cfg_scale); - } - // apply penalties const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev; const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n); diff --git a/common/sampling.h b/common/sampling.h index f3898a33c..baade2b85 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -17,7 +17,7 @@ typedef struct llama_sampling_params { float min_p = 0.05f; // 0.0 = disabled float tfs_z = 1.00f; // 1.0 = disabled float typical_p = 1.00f; // 1.0 = disabled - float temp = 0.80f; // 1.0 = disabled + float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) float penalty_repeat = 1.10f; // 1.0 = disabled float penalty_freq = 0.00f; // 0.0 = disabled diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index b133f3b49..4d995ef78 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -10,7 +10,7 @@ import re import sys from enum import IntEnum from pathlib import Path -from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast, Optional +from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast import numpy as np import torch @@ -189,6 +189,8 @@ class Model: return StableLMModel if model_architecture == "QWenLMHeadModel": return QwenModel + if model_architecture == "Qwen2ForCausalLM": + return Model if model_architecture == "MixtralForCausalLM": return MixtralModel if model_architecture == "GPT2LMHeadModel": @@ -197,6 +199,8 @@ class Model: return Phi2Model if model_architecture == "PlamoForCausalLM": return PlamoModel + if model_architecture == "CodeShellForCausalLM": + return CodeShellModel return Model def _is_model_safetensors(self) -> bool: @@ -234,6 +238,8 @@ class Model: return gguf.MODEL_ARCH.STABLELM if 
arch == "QWenLMHeadModel": return gguf.MODEL_ARCH.QWEN + if arch == "Qwen2ForCausalLM": + return gguf.MODEL_ARCH.QWEN2 if arch == "MixtralForCausalLM": return gguf.MODEL_ARCH.LLAMA if arch == "GPT2LMHeadModel": @@ -242,6 +248,8 @@ class Model: return gguf.MODEL_ARCH.PHI2 if arch == "PlamoForCausalLM": return gguf.MODEL_ARCH.PLAMO + if arch == "CodeShellForCausalLM": + return gguf.MODEL_ARCH.CODESHELL raise NotImplementedError(f'Architecture "{arch}" not supported!') @@ -266,11 +274,10 @@ class Model: toktypes.append(gguf.TokenType.USER_DEFINED) elif reverse_vocab[i] in added_vocab: tokens.append(reverse_vocab[i]) - if hasattr(tokenizer, "added_tokens_decoder"): - if tokenizer.added_tokens_decoder[i].special: - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) + if tokenizer.added_tokens_decoder[i].special: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) else: tokens.append(reverse_vocab[i]) toktypes.append(gguf.TokenType.NORMAL) @@ -480,7 +487,8 @@ class MPTModel(Model): # map tensor names if "scales" in name: new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales")) - new_name = new_name.replace("scales", "act.scales") + if new_name is not None: + new_name = new_name.replace("scales", "act.scales") else: new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: @@ -897,7 +905,7 @@ class QwenModel(Model): return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) @staticmethod - def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: Optional[int] = None) -> list[bytes]: + def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: parts = [bytes([b]) for b in token] while True: min_idx = None @@ -1177,6 +1185,70 @@ class PlamoModel(Model): self.gguf_writer.add_tensor(new_name, data) +class CodeShellModel(Model): + def set_gguf_parameters(self): + block_count = self.hparams["n_layer"] + + self.gguf_writer.add_name("CodeShell") + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_rope_freq_base(10000.0) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(1.0) + + def write_tensors(self): + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + tensors = dict(self.get_tensors()) + has_lm_head = "lm_head.weight" in tensors.keys() or "output.weight" in tensors.keys() + for name, data_torch in tensors.items(): + # we don't need these + if name.endswith((".attn.rotary_emb.inv_freq")): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is 
None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + if not has_lm_head and name == "transformer.wte.weight": + self.gguf_writer.add_tensor("output.weight", data) + print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") + ###### CONVERSION LOGIC ###### @@ -1214,7 +1286,7 @@ def main() -> None: if args.awq_path: sys.path.insert(1, str(Path(__file__).parent / 'awq-py')) - from awq.apply_awq import add_scale_weights + from awq.apply_awq import add_scale_weights # type: ignore[import-not-found] tmp_model_path = args.model / "weighted_model" dir_model = tmp_model_path if tmp_model_path.is_dir(): diff --git a/convert-llama-ggml-to-gguf.py b/convert-llama-ggml-to-gguf.py index e359330af..b33108062 100755 --- a/convert-llama-ggml-to-gguf.py +++ b/convert-llama-ggml-to-gguf.py @@ -2,6 +2,7 @@ from __future__ import annotations import argparse +import os import struct import sys from enum import IntEnum @@ -9,7 +10,6 @@ from pathlib import Path import numpy as np -import os if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf @@ -371,15 +371,11 @@ def handle_metadata(cfg, hp): params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path) else: raise ValueError('Unable to load metadata') - vocab = convert.load_vocab( - cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, - cfg.vocabtype) - # FIXME: Respect cfg.vocab_dir? 
- svocab = gguf.SpecialVocab(cfg.model_metadata_dir, - load_merges = cfg.vocabtype == 'bpe', - n_vocab = vocab.vocab_size) + vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir) + vocab_factory = convert.VocabFactory(vocab_path) + vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype, cfg.model_metadata_dir) convert.check_vocab_size(params, vocab) - return (params, vocab, svocab) + return params, vocab, special_vocab def handle_args(): diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py index 35ce152f4..4904bf128 100755 --- a/convert-lora-to-ggml.py +++ b/convert-lora-to-ggml.py @@ -5,17 +5,16 @@ import json import os import struct import sys +from pathlib import Path from typing import Any, BinaryIO, Sequence import numpy as np import torch -from pathlib import Path if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) import gguf - NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1} diff --git a/convert-persimmon-to-gguf.py b/convert-persimmon-to-gguf.py index 1ba5864dc..d2be805d1 100755 --- a/convert-persimmon-to-gguf.py +++ b/convert-persimmon-to-gguf.py @@ -1,11 +1,13 @@ #!/usr/bin/env python3 -import torch -import os -from pprint import pprint -import sys import argparse +import os +import sys from pathlib import Path +from pprint import pprint + +import torch from sentencepiece import SentencePieceProcessor + if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf @@ -69,7 +71,7 @@ def main(): persimmon_model = torch.load(args.ckpt_path) hparams = persimmon_model['args'] pprint(hparams) - tensors = {} + tensors: dict[str, torch.Tensor] = {} _flatten_dict(persimmon_model['model'], tensors, None) arch = gguf.MODEL_ARCH.PERSIMMON diff --git a/convert.py b/convert.py index 3b613eefc..06768033d 100755 --- a/convert.py +++ b/convert.py @@ -17,58 +17,28 @@ import signal import struct import sys import time -import warnings import zipfile from abc import ABCMeta, abstractmethod -from argparse import ArgumentParser from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from dataclasses import dataclass from pathlib import Path -from typing import ( - IO, - TYPE_CHECKING, - Any, - Callable, - Iterable, - Literal, - Optional, - Tuple, - TypeVar, -) +from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar import numpy as np from sentencepiece import SentencePieceProcessor -try: - from transformers import AutoTokenizer -except ModuleNotFoundError as e: - warnings.warn(f"Could not import AutoTokenizer from transformers: {e}") +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) +import gguf -# If NO_LOCAL_GGUF is not set, try to import gguf from the local gguf-py directory -if "NO_LOCAL_GGUF" not in os.environ: - # Use absolute path to the gguf-py directory - gguf_py_dir = str(Path(__file__).resolve().parent / "gguf-py") - print(gguf_py_dir) # NOTE: Remove this once path is verified after changes are completed - if gguf_py_dir not in sys.path: - sys.path.insert(1, gguf_py_dir) +if TYPE_CHECKING: + from typing import TypeAlias -# Import gguf module -try: - import gguf -except ModuleNotFoundError as e: - print(f"Could not import gguf: {e}") - sys.exit(1) - -if TYPE_CHECKING: # NOTE: This isn't necessary. - from typing import TypeAlias # This can technically be omitted. 
- -if hasattr(faulthandler, "register") and hasattr(signal, "SIGUSR1"): +if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'): faulthandler.register(signal.SIGUSR1) -# NOTE: n-dimensional arrays should be directly referenced -NDArray: TypeAlias = "np.ndarray[Any, Any]" +NDArray: TypeAlias = 'np.ndarray[Any, Any]' -# Why is this here? LLAMA and GPT are technically the only compatible ARCHs. ARCH = gguf.MODEL_ARCH.LLAMA DEFAULT_CONCURRENCY = 8 @@ -78,7 +48,6 @@ DEFAULT_CONCURRENCY = 8 # -# TODO: Clean up and refactor data types @dataclass(frozen=True) class DataType: name: str @@ -183,85 +152,65 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = { @dataclass class Params: - n_vocab: int - n_embd: int - n_layer: int - n_ctx: int - n_ff: int - n_head: int - n_head_kv: int - f_norm_eps: Optional[float] = None - n_experts: Optional[int] = None - n_experts_used: Optional[int] = None + n_vocab: int + n_embd: int + n_layer: int + n_ctx: int + n_ff: int + n_head: int + n_head_kv: int + n_experts: int | None = None + n_experts_used: int | None = None + f_norm_eps: float | None = None - rope_scaling_type: Optional[gguf.RopeScalingType] = None - f_rope_freq_base: Optional[float] = None - f_rope_scale: Optional[float] = None - n_orig_ctx: Optional[int] = None - rope_finetuned: Optional[bool] = None + rope_scaling_type: gguf.RopeScalingType | None = None + f_rope_freq_base: float | None = None + f_rope_scale: float | None = None + n_orig_ctx: int | None = None + rope_finetuned: bool | None = None - ftype: Optional[GGMLFileType] = None + ftype: GGMLFileType | None = None # path to the directory containing the model files - path_model: Optional[Path] = None + path_model: Path | None = None @staticmethod - def guessed(model: LazyModel) -> "Params": + def guessed(model: LazyModel) -> Params: # try transformer naming first - n_vocab, n_embd = ( - model["model.embed_tokens.weight"].shape - if "model.embed_tokens.weight" in model - else model["tok_embeddings.weight"].shape - ) + n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape # try transformer naming first if "model.layers.0.self_attn.q_proj.weight" in model: - n_layer = next( - i - for i in itertools.count() - if f"model.layers.{i}.self_attn.q_proj.weight" not in model - ) - elif ( - "model.layers.0.self_attn.W_pack.weight" in model - ): # next: try baichuan naming - n_layer = next( - i - for i in itertools.count() - if f"model.layers.{i}.self_attn.W_pack.weight" not in model - ) + n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model) + elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming + n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model) else: - n_layer = next( - i - for i in itertools.count() - if f"layers.{i}.attention.wq.weight" not in model - ) + n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model) if n_layer < 1: - raise Exception( - "failed to guess 'n_layer'. This model is unknown or unsupported.\n" - "Suggestion: provide 'config.json' of the model in the same directory containing model files." - ) + raise Exception("failed to guess 'n_layer'. 
This model is unknown or unsupported.\n" + "Suggestion: provide 'config.json' of the model in the same directory containing model files.") - n_head = n_embd // 128 # guessed - n_mult = 256 # guessed + n_head = n_embd // 128 # guessed + n_mult = 256 # guessed # TODO: verify this n_ff = int(2 * (4 * n_embd) / 3) n_ff = n_mult * ((n_ff + n_mult - 1) // n_mult) return Params( - n_vocab=n_vocab, - n_embd=n_embd, - n_layer=n_layer, - n_ctx=-1, - n_ff=n_ff, - n_head=n_head, - n_head_kv=n_head, - f_norm_eps=1e-5, + n_vocab = n_vocab, + n_embd = n_embd, + n_layer = n_layer, + n_ctx = -1, + n_ff = n_ff, + n_head = n_head, + n_head_kv = n_head, + f_norm_eps = 1e-5, ) @staticmethod - def load_transformers_config(model: LazyModel, config_path: Path) -> "Params": + def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: config = json.load(open(config_path)) rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None @@ -274,22 +223,20 @@ class Params: rope_scaling_type = gguf.RopeScalingType.LINEAR elif typ == "yarn": rope_scaling_type = gguf.RopeScalingType.YARN - n_orig_ctx = rope_scaling["original_max_position_embeddings"] - rope_finetuned = rope_scaling["finetuned"] + n_orig_ctx = rope_scaling['original_max_position_embeddings'] + rope_finetuned = rope_scaling['finetuned'] else: - raise NotImplementedError(f"Unknown rope scaling type: {typ}") + raise NotImplementedError(f'Unknown rope scaling type: {typ}') if "max_sequence_length" in config: n_ctx = config["max_sequence_length"] elif "max_position_embeddings" in config: n_ctx = config["max_position_embeddings"] else: - raise Exception( - "failed to guess 'n_ctx'. This model is unknown or unsupported.\n" - "Suggestion: provide 'config.json' of the model in the same directory containing model files." - ) + raise Exception("failed to guess 'n_ctx'. 
This model is unknown or unsupported.\n" + "Suggestion: provide 'config.json' of the model in the same directory containing model files.") - n_experts = None + n_experts = None n_experts_used = None if "num_local_experts" in config: @@ -297,30 +244,30 @@ class Params: n_experts_used = config["num_experts_per_tok"] return Params( - n_vocab=config["vocab_size"], - n_embd=config["hidden_size"], - n_layer=config["num_hidden_layers"], - n_ctx=n_ctx, - n_ff=config["intermediate_size"], - n_head=(n_head := config["num_attention_heads"]), - n_head_kv=config.get("num_key_value_heads", n_head), - n_experts=n_experts, - n_experts_used=n_experts_used, - f_norm_eps=config["rms_norm_eps"], - f_rope_freq_base=config.get("rope_theta"), - rope_scaling_type=rope_scaling_type, - f_rope_scale=f_rope_scale, - n_orig_ctx=n_orig_ctx, - rope_finetuned=rope_finetuned, + n_vocab = config["vocab_size"], + n_embd = config["hidden_size"], + n_layer = config["num_hidden_layers"], + n_ctx = n_ctx, + n_ff = config["intermediate_size"], + n_head = (n_head := config["num_attention_heads"]), + n_head_kv = config.get("num_key_value_heads", n_head), + n_experts = n_experts, + n_experts_used = n_experts_used, + f_norm_eps = config["rms_norm_eps"], + f_rope_freq_base = config.get("rope_theta"), + rope_scaling_type = rope_scaling_type, + f_rope_scale = f_rope_scale, + n_orig_ctx = n_orig_ctx, + rope_finetuned = rope_finetuned, ) # LLaMA v2 70B params.json # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1} @staticmethod - def load_torch_params(model: LazyModel, config_path: Path) -> "Params": + def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: config = json.load(open(config_path)) - n_experts = None + n_experts = None n_experts_used = None f_rope_freq_base = None @@ -343,89 +290,87 @@ class Params: if config.get("moe"): n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0] - n_experts = config["moe"]["num_experts"] + n_experts = config["moe"]["num_experts"] n_experts_used = config["moe"]["num_experts_per_tok"] f_rope_freq_base = 1e6 return Params( - n_vocab=config.get("vocab_size", model["tok_embeddings.weight"].shape[0]), - n_embd=config["dim"], - n_layer=config["n_layers"], - n_ctx=n_ctx, - n_ff=n_ff, - n_head=(n_head := config["n_heads"]), - n_head_kv=config.get("n_kv_heads", n_head), - n_experts=n_experts, - n_experts_used=n_experts_used, - f_norm_eps=config["norm_eps"], - f_rope_freq_base=config.get("rope_theta", f_rope_freq_base), + n_vocab = model["tok_embeddings.weight"].shape[0], + n_embd = config["dim"], + n_layer = config["n_layers"], + n_ctx = n_ctx, + n_ff = n_ff, + n_head = (n_head := config["n_heads"]), + n_head_kv = config.get("n_kv_heads", n_head), + n_experts = n_experts, + n_experts_used = n_experts_used, + f_norm_eps = config["norm_eps"], + f_rope_freq_base = config.get("rope_theta", f_rope_freq_base), ) @staticmethod - def load(model_plus: ModelPlus) -> "Params": - hf_config_path = model_plus.paths[0].parent / "config.json" + def load(model_plus: ModelPlus) -> Params: + hf_config_path = model_plus.paths[0].parent / "config.json" orig_config_path = model_plus.paths[0].parent / "params.json" if hf_config_path.exists(): - params = Params.load_transformers_config(model_plus.model, hf_config_path) + params = Params.loadHFTransformerJson(model_plus.model, hf_config_path) elif orig_config_path.exists(): - params = Params.load_torch_params(model_plus.model, orig_config_path) - elif 
model_plus.format != "none": + params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path) + elif model_plus.format != 'none': params = Params.guessed(model_plus.model) else: - raise ValueError("Cannot guess params when model format is none") + raise ValueError('Cannot guess params when model format is none') params.path_model = model_plus.paths[0].parent return params -class BpeVocab: # GPT - def __init__( - self, fname_tokenizer: Path, fname_added_tokens: Optional[Path] - ) -> None: - self.bpe_tokenizer = json.loads( - open(str(fname_tokenizer), encoding="utf-8").read() - ) +# +# vocab +# + +class BpeVocab: + def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: + self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read()) + self.vocab = self.bpe_tokenizer["model"]["vocab"] added_tokens: dict[str, int] if fname_added_tokens is not None: # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab. added_tokens = json.load(open(fname_added_tokens, encoding="utf-8")) else: # Fall back to trying to find the added tokens in tokenizer.json - tokenizer_json_file = fname_tokenizer.parent / "tokenizer.json" + tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json' if not tokenizer_json_file.is_file(): added_tokens = {} else: tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8")) added_tokens = dict( - (item["content"], item["id"]) - for item in tokenizer_json.get("added_tokens", []) + (item['content'], item['id']) + for item in tokenizer_json.get('added_tokens', []) # Added tokens here can be duplicates of the main vocabulary. - if item["content"] not in self.bpe_tokenizer - ) + if item['content'] not in self.bpe_tokenizer) - vocab_size: int = len(self.bpe_tokenizer) - expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) - actual_ids = sorted(added_tokens.values()) + vocab_size: int = len(self.vocab) + expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) + actual_ids = sorted(added_tokens.values()) if expected_ids != actual_ids: expected_end_id = vocab_size + len(actual_ids) - 1 - raise Exception( - f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}" - ) + raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}") items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) - self.added_tokens_list = [text for (text, idx) in items] + self.added_tokens_dict = added_tokens + self.added_tokens_list = [text for (text, idx) in items] self.vocab_size_base: int = vocab_size - self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) - self.fname_tokenizer = fname_tokenizer - self.fname_added_tokens = fname_added_tokens + self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) + self.fname_tokenizer = fname_tokenizer + self.fname_added_tokens = fname_added_tokens def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - tokenizer = self.bpe_tokenizer - reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()} + reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()} - for i, _ in enumerate(tokenizer): + for i, _ in enumerate(self.vocab): yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: @@ -441,10 +386,8 @@ class BpeVocab: # 
GPT return f"" -class SentencePieceVocab: # LlaMa - def __init__( - self, fname_tokenizer: Path, fname_added_tokens: Optional[Path] - ) -> None: +class SentencePieceVocab: + def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer)) added_tokens: dict[str, int] if fname_added_tokens is not None: @@ -454,22 +397,19 @@ class SentencePieceVocab: # LlaMa vocab_size: int = self.sentencepiece_tokenizer.vocab_size() - new_tokens = { - id: piece for piece, id in added_tokens.items() if id >= vocab_size - } + new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size} expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens))) - actual_new_ids = sorted(new_tokens.keys()) + actual_new_ids = sorted(new_tokens.keys()) if expected_new_ids != actual_new_ids: - raise ValueError( - f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}" - ) + raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}") # Token pieces that were added to the base vocabulary. - self.added_tokens_list = [new_tokens[id] for id in actual_new_ids] - self.vocab_size_base = vocab_size - self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) - self.fname_tokenizer = fname_tokenizer + self.added_tokens_dict = added_tokens + self.added_tokens_list = [new_tokens[id] for id in actual_new_ids] + self.vocab_size_base = vocab_size + self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) + self.fname_tokenizer = fname_tokenizer self.fname_added_tokens = fname_added_tokens def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: @@ -510,11 +450,15 @@ class SentencePieceVocab: # LlaMa class HfVocab: - def __init__( - self, - fname_tokenizer: Path, - fname_added_tokens: Optional[Path] = None, - ) -> None: + def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None: + try: + from transformers import AutoTokenizer + except ImportError as e: + raise ImportError( + "To use HfVocab, please install the `transformers` package. " + "You can install it with `pip install transformers`." + ) from e + print("fname_tokenizer:", fname_tokenizer) # Allow the tokenizer to default to slow or fast versions. # Explicitly set tokenizer to use local paths. 
@@ -527,7 +471,7 @@ class HfVocab: # Initialize lists and dictionaries for added tokens self.added_tokens_list = [] self.added_tokens_dict = dict() - self.added_tokens_ids = set() + self.added_tokens_ids = set() # Process added tokens for tok, tokidx in sorted( @@ -548,12 +492,12 @@ class HfVocab: # Set vocabulary sizes self.vocab_size_base = self.tokenizer.vocab_size - self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) + self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) - self.fname_tokenizer = fname_tokenizer + self.fname_tokenizer = fname_tokenizer self.fname_added_tokens = fname_added_tokens - def hf_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]: + def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: reverse_vocab = { id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items() } @@ -571,11 +515,9 @@ class HfVocab: token_id, self.special_ids # Reuse already stored special IDs ) - def get_token_type(self, token_id: int, special_ids: set) -> gguf.TokenType: + def get_token_type(self, token_id: int, special_ids: set[int]) -> gguf.TokenType: # Determine token type based on whether it's a special token - return ( - gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL - ) + return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL def get_token_score(self, token_id: int) -> float: # Placeholder for actual logic to determine the token's score @@ -587,7 +529,6 @@ class HfVocab: if text in self.specials: toktype = self.get_token_type(self.specials[text], self.special_ids) score = self.get_token_score(self.specials[text]) - else: toktype = gguf.TokenType.USER_DEFINED score = -1000.0 @@ -781,7 +722,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus: else: model = merge_sharded([mp.model for mp in models_plus]) - return ModelPlus(model, paths, format, vocab) + return ModelPlus(model, paths, format, vocab) # pytype: disable=wrong-arg-types def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor: @@ -869,17 +810,13 @@ class LazyUnpickler(pickle.Unpickler): CLASSES: dict[tuple[str, str], Any] = { # getattr used here as a workaround for mypy not being smart enough to determine # the staticmethods have a __func__ attribute. 
- ("torch._tensor", "_rebuild_from_type_v2"): getattr( - rebuild_from_type_v2, "__func__" - ), - ("torch._utils", "_rebuild_tensor_v2"): getattr( - lazy_rebuild_tensor_v2, "__func__" - ), - ("torch", "BFloat16Storage"): LazyStorageKind(DT_BF16), - ("torch", "HalfStorage"): LazyStorageKind(DT_F16), - ("torch", "FloatStorage"): LazyStorageKind(DT_F32), - ("torch", "IntStorage"): LazyStorageKind(DT_I32), - ("torch", "Tensor"): LazyTensor, + ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'), + ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'), + ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16), + ('torch', 'HalfStorage'): LazyStorageKind(DT_F16), + ('torch', 'FloatStorage'): LazyStorageKind(DT_F32), + ('torch', 'IntStorage'): LazyStorageKind(DT_I32), + ('torch', 'Tensor'): LazyTensor, } def find_class(self, module: str, name: str) -> Any: @@ -966,7 +903,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc executor_class = ProcessPoolExecutor else: executor_class = ThreadPoolExecutor - with executor_class(max_workers = max_workers) as executor: + with executor_class(max_workers=max_workers) as executor: futures: list[concurrent.futures.Future[Out]] = [] done = False for _ in range(concurrency): @@ -1006,6 +943,7 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N ) for i in range(1, pad_count + 1): vocab.added_tokens_dict[f""] = -1 + vocab.added_tokens_list.append(f"") vocab.vocab_size = params.n_vocab return @@ -1019,12 +957,8 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N class OutputFile: - def __init__( - self, fname_out: Path, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE - ) -> None: - self.gguf = gguf.GGUFWriter( - fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess - ) + def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None: + self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess) def add_meta_arch(self, params: Params) -> None: name = "LLaMA" @@ -1033,21 +967,16 @@ class OutputFile: if params.n_ctx == 4096: name = "LLaMA v2" elif params.path_model is not None: - name = str(params.path_model.parent).split("/")[-1] + name = str(params.path_model.parent).split('/')[-1] - self.gguf.add_name(name) - self.gguf.add_context_length(params.n_ctx) - self.gguf.add_embedding_length(params.n_embd) - self.gguf.add_block_count(params.n_layer) - self.gguf.add_feed_forward_length(params.n_ff) + self.gguf.add_name (name) + self.gguf.add_context_length (params.n_ctx) + self.gguf.add_embedding_length (params.n_embd) + self.gguf.add_block_count (params.n_layer) + self.gguf.add_feed_forward_length (params.n_ff) self.gguf.add_rope_dimension_count(params.n_embd // params.n_head) - self.gguf.add_head_count(params.n_head) - self.gguf.add_head_count_kv(params.n_head_kv) - - if params.f_norm_eps is None: - raise ValueError("f_norm_eps is None") - - self.gguf.add_layer_norm_rms_eps(params.f_norm_eps) + self.gguf.add_head_count (params.n_head) + self.gguf.add_head_count_kv (params.n_head_kv) if params.n_experts: self.gguf.add_expert_count(params.n_experts) @@ -1055,6 +984,11 @@ class OutputFile: if params.n_experts_used: self.gguf.add_expert_used_count(params.n_experts_used) + if params.f_norm_eps: + self.gguf.add_layer_norm_rms_eps(params.f_norm_eps) + else: + raise ValueError('f_norm_eps is None') + if params.f_rope_freq_base is not None: 
self.gguf.add_rope_freq_base(params.f_rope_freq_base) @@ -1086,7 +1020,7 @@ class OutputFile: return tokenizer_model - def extract_vocabulary_from_model(self, vocab: Vocab) -> Tuple[list, list, list]: + def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]: tokens = [] scores = [] toktypes = [] @@ -1097,6 +1031,8 @@ class OutputFile: scores.append(score) toktypes.append(toktype) + assert len(tokens) == vocab.vocab_size + return tokens, scores, toktypes def add_meta_vocab(self, vocab: Vocab) -> None: @@ -1119,14 +1055,10 @@ class OutputFile: def add_tensor_info(self, name: str, tensor: LazyTensor) -> None: n_elements = int(np.prod(tensor.shape)) - raw_dtype = getattr(tensor.data_type, "ggml_type", None) - data_type = ( - getattr(tensor.data_type, "quantized_type", None) or tensor.data_type.dtype - ) + raw_dtype = getattr(tensor.data_type, 'ggml_type', None) + data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype data_nbytes = tensor.data_type.elements_to_bytes(n_elements) - self.gguf.add_tensor_info( - name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype - ) + self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype) def write_meta(self) -> None: self.gguf.write_header_to_file() @@ -1140,14 +1072,10 @@ class OutputFile: @staticmethod def write_vocab_only( - fname_out: Path, - params: Params, - vocab: Vocab, - svocab: gguf.SpecialVocab, - endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, - pad_vocab: bool = False, + fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, + endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, ) -> None: - check_vocab_size(params, vocab, pad_vocab=pad_vocab) + check_vocab_size(params, vocab, pad_vocab = pad_vocab) of = OutputFile(fname_out, endianess=endianess) @@ -1175,14 +1103,8 @@ class OutputFile: @staticmethod def write_all( - fname_out: Path, - ftype: GGMLFileType, - params: Params, - model: LazyModel, - vocab: Vocab, - svocab: gguf.SpecialVocab, - concurrency: int = DEFAULT_CONCURRENCY, - endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, + fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, + concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, ) -> None: check_vocab_size(params, vocab, pad_vocab=pad_vocab) @@ -1202,26 +1124,19 @@ class OutputFile: of.write_tensor_info() # tensor data - ndarrays_inner = bounded_parallel_map( - OutputFile.do_item, model.items(), concurrency=concurrency - ) + ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency) if ftype == GGMLFileType.MostlyQ8_0: ndarrays = bounded_parallel_map( - OutputFile.maybe_do_quantize, - ndarrays_inner, - concurrency=concurrency, - max_workers=concurrency, + OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency, use_processpool_executor=True, ) else: ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner) start = time.time() - for i, ((name, lazy_tensor), ndarray) in enumerate( - zip(model.items(), ndarrays) - ): + for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)): elapsed = time.time() - start - size = " x ".join(f"{dim:6d}" for dim in lazy_tensor.shape) + size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape) padi = len(str(len(model))) print( 
f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}" @@ -1358,7 +1273,7 @@ def load_some_model(path: Path) -> ModelPlus: class VocabFactory: def __init__(self, path: Path): self.path = path - self.files = { + self.files: dict[str, Path | None] = { "tokenizer.model": None, "vocab.json": None, "tokenizer.json": None, @@ -1373,27 +1288,20 @@ class VocabFactory: self.files[file] = file_path elif parent_file_path.exists(): self.files[file] = parent_file_path + print(f"Found vocab files: {self.files}") - def _select_file(self, vocabtype: Optional[str]) -> Path: + def _select_file(self, vocabtype: str | None) -> Path: if vocabtype in ["spm", "bpe"]: - # For SentencePiece and BPE, return specific files as before - file_key = "tokenizer.model" if vocabtype == "spm" else "vocab.json" - if self.files[file_key]: - return self.files[file_key] - else: - raise FileNotFoundError(f"{vocabtype} {file_key} not found.") - elif vocabtype == "hfft": + for file_key in self.files.keys(): + if (file := self.files[file_key]) is not None: + return file + raise FileNotFoundError(f"{vocabtype} vocab not found.") + if vocabtype == "hfft": # For Hugging Face Fast Tokenizer, return the directory path instead of a specific file return self.path - else: - raise ValueError(f"Unsupported vocabulary type {vocabtype}") + raise ValueError(f"Unsupported vocabulary type {vocabtype}") - def _create_special_vocab( - self, - vocab: Vocab, - vocabtype: str, - model_parent_path: Path, - ) -> gguf.SpecialVocab: + def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab: load_merges = vocabtype == "bpe" n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None return gguf.SpecialVocab( @@ -1403,13 +1311,12 @@ class VocabFactory: n_vocab=n_vocab, ) - def load_vocab( - self, vocabtype: str, model_parent_path: Path - ) -> Tuple[Vocab, gguf.SpecialVocab]: + def load_vocab(self, vocabtype: str, model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]: path = self._select_file(vocabtype) print(f"Loading vocab file '{path}', type '{vocabtype}'") added_tokens_path = path.parent / "added_tokens.json" + vocab: Vocab if vocabtype == "bpe": vocab = BpeVocab( path, added_tokens_path if added_tokens_path.exists() else None @@ -1424,6 +1331,7 @@ class VocabFactory: ) else: raise ValueError(f"Unsupported vocabulary type {vocabtype}") + # FIXME: Respect --vocab-dir? special_vocab = self._create_special_vocab( vocab, vocabtype, @@ -1432,18 +1340,17 @@ class VocabFactory: return vocab, special_vocab -def default_output_file(model_paths: list[Path], file_type: GGMLFileType) -> Path: +def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path: namestr = { - GGMLFileType.AllF32: "f32", + GGMLFileType.AllF32: "f32", GGMLFileType.MostlyF16: "f16", - GGMLFileType.MostlyQ8_0: "q8_0", + GGMLFileType.MostlyQ8_0:"q8_0", }[file_type] ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf" if ret in model_paths: sys.stderr.write( f"Error: Default output path ({ret}) would overwrite the input. 
" - "Please explicitly specify a path using --outfile.\n" - ) + "Please explicitly specify a path using --outfile.\n") sys.exit(1) return ret @@ -1453,111 +1360,34 @@ def do_dump_model(model_plus: ModelPlus) -> None: print(f"model_plus.format = {model_plus.format!r}") print(f"model_plus.vocab = {model_plus.vocab!r}") for name, lazy_tensor in model_plus.model.items(): - print( - f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}" - ) + print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") -def get_argument_parser() -> ArgumentParser: +def main(args_in: list[str] | None = None) -> None: output_choices = ["f32", "f16"] if np.uint32(1) == np.uint32(1).newbyteorder("<"): # We currently only support Q8_0 output on little endian systems. output_choices.append("q8_0") + vocab_types = ["spm", "bpe", "hfft"] + parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file") + parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None) + parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") + parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") + parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") + parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)") + parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") + parser.add_argument("--vocab-type", choices=vocab_types, help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") + parser.add_argument("--ctx", type=int, help="model training context (default: based on input)") + parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY) + parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine") + parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides") - parser = argparse.ArgumentParser( - description="Convert a LLaMa model to a GGML compatible file" - ) - - parser.add_argument( - "model", - type=Path, - help="Directory containing the model file or the model file itself (*.pth, *.pt, *.bin)", - ) - - parser.add_argument( - "--awq-path", - type=Path, - help="Path to the Activation-aware Weight Quantization cache file", - default=None, - ) - - parser.add_argument( - "--dump", - action="store_true", - help="Display the model content without converting it", - ) - - parser.add_argument( - "--dump-single", - action="store_true", - help="Display the content of a single model file without conversion", - ) - - parser.add_argument( - "--vocab-only", - action="store_true", - help="Extract and output only the vocabulary", - ) - - parser.add_argument( - "--outtype", - choices=output_choices, - help="Output format - note: q8_0 may be very slow (default: f16 or f32 based on input)", - ) - - parser.add_argument( - "--vocab-dir", - type=Path, - 
help="Directory containing the tokenizer.model, if separate from the model file", - ) - - parser.add_argument( - "--vocab-type", - choices=["spm", "bpe", "hfft"], # hfft: Hugging Face Fast Tokenizer - default="spm", - help="The vocabulary format used to define the tokenizer model (default: spm)", - ) - - parser.add_argument( - "--pad-vocab", - action="store_true", - help="Add padding tokens when the model's vocabulary size exceeds the tokenizer metadata", - ) - - parser.add_argument( - "--outfile", - type=Path, - help="Specify the path for the output file (default is based on input)", - ) - - parser.add_argument( - "--ctx", type=int, help="Model training context (default is based on input)" - ) - - parser.add_argument( - "--concurrency", - type=int, - help=f"Concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", - default=DEFAULT_CONCURRENCY, - ) - - parser.add_argument( - "--big-endian", - action="store_true", - help="Indicate that the model is executed on a big-endian machine", - ) - - return parser - - -def main(argv: Optional[list[str]] = None) -> None: - parser = get_argument_parser() - args = parser.parse_args(argv) - + args = parser.parse_args(args_in) if args.awq_path: - sys.path.insert(1, str(Path(__file__).resolve().parent / "awq-py")) - from awq.apply_awq import add_scale_weights - + sys.path.insert(1, str(Path(__file__).parent / 'awq-py')) + from awq.apply_awq import add_scale_weights # type: ignore[import-not-found] tmp_model_path = args.model / "weighted_model" if tmp_model_path.is_dir(): print(f"{tmp_model_path} exists as a weighted model.") @@ -1576,14 +1406,11 @@ def main(argv: Optional[list[str]] = None) -> None: if not args.vocab_only: model_plus = load_some_model(args.model) else: - model_plus = ModelPlus( - model={}, paths=[args.model / "dummy"], format="none", vocab=None - ) + model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None) if args.dump: do_dump_model(model_plus) return - endianess = gguf.GGUFEndian.LITTLE if args.big_endian: endianess = gguf.GGUFEndian.BIG @@ -1591,12 +1418,10 @@ def main(argv: Optional[list[str]] = None) -> None: params = Params.load(model_plus) if params.n_ctx == -1: if args.ctx is None: - raise Exception( - "The model doesn't have a context size, and you didn't specify one with --ctx\n" - "Please specify one with --ctx:\n" - " - LLaMA v1: --ctx 2048\n" - " - LLaMA v2: --ctx 4096\n" - ) + raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n" + "Please specify one with --ctx:\n" + " - LLaMA v1: --ctx 2048\n" + " - LLaMA v2: --ctx 4096\n") params.n_ctx = args.ctx if args.outtype: @@ -1617,42 +1442,30 @@ def main(argv: Optional[list[str]] = None) -> None: if not args.outfile: raise ValueError("need --outfile if using --vocab-only") outfile = args.outfile - OutputFile.write_vocab_only( - outfile, - params, - vocab, - special_vocab, - endianess=endianess, - pad_vocab=args.pad_vocab, - ) + OutputFile.write_vocab_only(outfile, params, vocab, special_vocab, + endianess=endianess, pad_vocab=args.pad_vocab) print(f"Wrote {outfile}") return if model_plus.vocab is not None and args.vocab_dir is None: vocab = model_plus.vocab - model = model_plus.model - model = convert_model_names(model, params) - ftype = pick_output_type(model, args.outtype) - model = convert_to_output_type(model, ftype) - outfile = args.outfile or default_output_file(model_plus.paths, ftype) + print(f"Vocab info: {vocab}") + print(f"Special vocab info: {special_vocab}") + + model = 
model_plus.model + model = convert_model_names(model, params) + ftype = pick_output_type(model, args.outtype) + model = convert_to_output_type(model, ftype) + outfile = args.outfile or default_outfile(model_plus.paths, ftype) params.ftype = ftype print(f"Writing {outfile}, format {ftype}") - OutputFile.write_all( - outfile, - ftype, - params, - model, - vocab, - special_vocab, - concurrency=args.concurrency, - endianess=endianess, - pad_vocab=args.pad_vocab, - ) + OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, + concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab) print(f"Wrote {outfile}") -if __name__ == "__main__": - main(sys.argv[1:]) # Exclude the first element (script name) from sys.argv +if __name__ == '__main__': + main() diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index eaca42fc1..11fcbf443 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1138,9 +1138,8 @@ static void save_as_llama_lora(const char * filename, struct my_llama_lora * lor return tn_buf.data(); }; - uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla' // write_magic - file.write_u32(LLAMA_FILE_MAGIC_LORA); // magic + file.write_u32(LLAMA_FILE_MAGIC_GGLA); // magic file.write_u32(1); // version // write_hparams file.write_u32(lora->hparams.lora_r); @@ -1800,7 +1799,7 @@ int main(int argc, char ** argv) { std::vector<llama_token> train_tokens; std::vector<size_t> train_samples_begin; std::vector<size_t> train_samples_size; - printf("%s: tokenize training data\n", __func__); + printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data); tokenize_file(lctx, params.common.fn_train_data, params.common.sample_start, diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md new file mode 100644 index 000000000..578e8fc27 --- /dev/null +++ b/examples/imatrix/README.md @@ -0,0 +1,32 @@ +# llama.cpp/examples/imatrix + +Compute an importance matrix for a model and a given text dataset. It can be used during quantization to enhance the quality of the quantized models. +More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861 + +## Usage + +``` +./imatrix -m <some_fp_model> -f <some_training_data> [-o <output_file>] [--verbosity <verbosity_level>] + [-ofreq num_chunks] [-ow <0 or 1>] [other common params] +``` + +Here `-m` with a model name and `-f` with a file containing training data (such as `wiki.train.raw`) are mandatory. +The parameters in square brackets are optional and have the following meaning: +* `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing, `imatrix.dat` is used. +* `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`. +* `-ofreq` (or `--output-frequency`) specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks). +* `-ow` (or `--output-weight`) specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
+ +For faster computation, make sure to use GPU offloading via the `-ngl` argument. + +## Example + +```bash +LLAMA_CUBLAS=1 make -j + +# generate importance matrix (imatrix.dat) +./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99 + +# use the imatrix to perform a Q4_K_M quantization +./quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m +``` diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 1461bc963..5687476cd 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -33,43 +33,120 @@ class IMatrixCollector { public: IMatrixCollector() = default; void set_parameters(StatParams&& params) { m_params = std::move(params); } - void collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1); + bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data); void save_imatrix() const; private: std::unordered_map<std::string, Stats> m_stats; StatParams m_params; std::mutex m_mutex; int m_last_call = 0; + std::vector<float> m_src1_data; + std::vector<int> m_ids; // the expert ids from ggml_mul_mat_id }; -void IMatrixCollector::collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) { - if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return; - if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return; +bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { + GGML_UNUSED(user_data); + + const struct ggml_tensor * src0 = t->src[0]; + const struct ggml_tensor * src1 = t->src[1]; + + // when ask is true, the scheduler wants to know if we are interested in data from this tensor + // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection + if (ask) { + if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications + if (t->op != GGML_OP_MUL_MAT) return false; + if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false; + if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return false; + return true; + } + std::lock_guard<std::mutex> lock(m_mutex); - auto& e = m_stats[src0->name]; - if (e.values.empty()) { - e.values.resize(src1->ne[0], 0); + + // copy the data from the GPU memory if needed + const bool is_host = ggml_backend_buffer_is_host(src1->buffer); + + if (!is_host) { + m_src1_data.resize(ggml_nelements(src1)); + ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1)); } - else if (e.values.size() != (size_t)src1->ne[0]) { - fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]); - exit(1); //GGML_ASSERT(false); - } - ++e.ncall; - if (m_params.verbosity > 1) { - printf("%s[%d]: %s, %d x %d, %d\n",__func__,m_last_call,src0->name,(int)src1->ne[0],(int)src1->ne[1],(int)src1->type); - } - for (int row = 0; row < (int)src1->ne[1]; ++row) { - const float * x = (const float *)src1->data + row * src1->ne[0]; - for (int j = 0; j < (int)src1->ne[0]; ++j) { - e.values[j] += x[j]*x[j]; - } - } - if (e.ncall > m_last_call) { - m_last_call = e.ncall; - if (m_last_call % m_params.n_output_frequency == 0) { - save_imatrix(); + + const float * data = is_host ?
(const float *) src1->data : m_src1_data.data(); + + if (t->op == GGML_OP_MUL_MAT_ID) { + const int idx = ((int32_t *) t->op_params)[0]; + const int n_as = ((int32_t *) t->op_params)[1]; + + // the top-k selected expert ids are stored in the src0 tensor + // for simplicity, always copy src0 to host, because it is small + // take into account that src0 is not contiguous! + GGML_ASSERT(src0->ne[1] == src1->ne[1]); + GGML_ASSERT(n_as*ggml_nrows(src0)*sizeof(int) == GGML_PAD(ggml_nbytes(src0), n_as*sizeof(int))); + m_ids.resize(ggml_nbytes(src0)/sizeof(int)); + ggml_backend_tensor_get(src0, m_ids.data(), 0, ggml_nbytes(src0)); + + // loop over all possible experts, regardless if they are used or not in the batch + // this is necessary to guarantee equal number of "ncall" for each tensor + for (int ex = 0; ex < n_as; ++ex) { + src0 = t->src[2 + ex]; + auto& e = m_stats[src0->name]; + if (e.values.empty()) { + e.values.resize(src1->ne[0], 0); + } + else if (e.values.size() != (size_t)src1->ne[0]) { + fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]); + exit(1); //GGML_ASSERT(false); + } + // NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger + // using the following line, we can correct for that if needed + //if (idx == t->src[0]->ne[0] - 1) ++e.ncall; + ++e.ncall; + if (m_params.verbosity > 1) { + printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); + } + for (int row = 0; row < (int)src1->ne[1]; ++row) { + const int excur = m_ids[row*n_as + idx]; + GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check + if (excur != ex) continue; + const float * x = data + row * src1->ne[0]; + for (int j = 0; j < (int)src1->ne[0]; ++j) { + e.values[j] += x[j]*x[j]; + } + } + if (e.ncall > m_last_call) { + m_last_call = e.ncall; + if (m_last_call % m_params.n_output_frequency == 0) { + save_imatrix(); + } + } + } + } else { + auto& e = m_stats[src0->name]; + if (e.values.empty()) { + e.values.resize(src1->ne[0], 0); + } + else if (e.values.size() != (size_t)src1->ne[0]) { + fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]); + exit(1); //GGML_ASSERT(false); + } + ++e.ncall; + if (m_params.verbosity > 1) { + printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); + } + for (int row = 0; row < (int)src1->ne[1]; ++row) { + const float * x = data + row * src1->ne[0]; + for (int j = 0; j < (int)src1->ne[0]; ++j) { + e.values[j] += x[j]*x[j]; + } + } + if (e.ncall > m_last_call) { + m_last_call = e.ncall; + if (m_last_call % m_params.n_output_frequency == 0) { + save_imatrix(); + } } } + + return true; } void IMatrixCollector::save_imatrix() const { @@ -93,8 +170,8 @@ void IMatrixCollector::save_imatrix() const { static IMatrixCollector g_collector; -static void ik_collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) { - g_collector.collect_imatrix(src0, src1); +static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { + return g_collector.collect_imatrix(t, ask, user_data); } @@ -171,7 +248,7 @@ static void process_logits( } } -static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { +static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool 
compute_ppl) { const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); const int n_ctx = llama_n_ctx(ctx); @@ -192,10 +269,12 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { } std::vector logit_history; - logit_history.resize(tokens.size()); - std::vector prob_history; - prob_history.resize(tokens.size()); + + if (compute_ppl) { + logit_history.resize(tokens.size()); + prob_history.resize(tokens.size()); + } const int n_chunk_max = tokens.size() / n_ctx; @@ -211,12 +290,17 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { std::vector workers(std::thread::hardware_concurrency() - 1); + const int num_batches = (n_ctx + n_batch - 1) / n_batch; + + std::vector logits; + if (compute_ppl && num_batches > 1) { + logits.reserve((size_t)n_ctx * n_vocab); + } + for (int i = 0; i < n_chunk; ++i) { const int start = i * n_ctx; const int end = start + n_ctx; - const int num_batches = (n_ctx + n_batch - 1) / n_batch; - std::vector logits; const auto t_start = std::chrono::high_resolution_clock::now(); @@ -244,8 +328,10 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { // restore the original token in case it was set to BOS tokens[batch_start] = token_org; - const auto * batch_logits = llama_get_logits(ctx); - logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); + if (compute_ppl && num_batches > 1) { + const auto * batch_logits = llama_get_logits(ctx); + logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); + } } const auto t_end = std::chrono::high_resolution_clock::now(); @@ -261,25 +347,32 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); } - const int first = n_ctx/2; - process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, - workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); - count += n_ctx - first - 1; + if (compute_ppl) { + const int first = n_ctx/2; + const auto all_logits = num_batches > 1 ? 
logits.data() : llama_get_logits(ctx); + process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, + workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); + count += n_ctx - first - 1; - printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); - fflush(stdout); + printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); + fflush(stdout); + + logits.clear(); + } } printf("\n"); - nll2 /= count; - nll /= count; - const double ppl = exp(nll); - nll2 -= nll * nll; - if (nll2 > 0) { - nll2 = sqrt(nll2/(count-1)); - printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); - } else { - printf("Unexpected negative standard deviation of log(prob)\n"); + if (compute_ppl) { + nll2 /= count; + nll /= count; + const double ppl = exp(nll); + nll2 -= nll * nll; + if (nll2 > 0) { + nll2 = sqrt(nll2/(count-1)); + printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); + } else { + printf("Unexpected negative standard deviation of log(prob)\n"); + } } return true; @@ -288,6 +381,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { int main(int argc, char ** argv) { StatParams sparams; + bool compute_ppl = true; std::vector args; args.push_back(argv[0]); int iarg = 1; @@ -304,12 +398,19 @@ int main(int argc, char ** argv) { } else if (arg == "--verbosity") { sparams.verbosity = std::stoi(argv[++iarg]); + } else if (arg == "--no-ppl") { + compute_ppl = false; } else { args.push_back(argv[iarg]); } } if (iarg < argc) { - args.push_back(argv[iarg]); + std::string arg{argv[iarg]}; + if (arg == "--no-ppl") { + compute_ppl = false; + } else { + args.push_back(argv[iarg]); + } } gpt_params params; @@ -320,8 +421,6 @@ int main(int argc, char ** argv) { g_collector.set_parameters(std::move(sparams)); - ggml_set_imatrix_collection(ik_collect_imatrix); - params.logits_all = true; params.n_batch = std::min(params.n_batch, params.n_ctx); @@ -340,16 +439,27 @@ int main(int argc, char ** argv) { llama_backend_init(params.numa); - llama_model * model; - llama_context * ctx; + llama_model_params mparams = llama_model_params_from_gpt_params(params); - // load the model and apply lora adapter, if any - std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams); if (model == NULL) { fprintf(stderr, "%s: error: unable to load model\n", __func__); return 1; } + llama_context_params cparams = llama_context_params_from_gpt_params(params); + + // pass the callback to the backend scheduler + // it will be executed for each node during the graph computation + cparams.cb_eval = ik_collect_imatrix; + cparams.cb_eval_user_data = NULL; + + llama_context * ctx = llama_new_context_with_model(model, cparams); + if (ctx == NULL) { + fprintf(stderr, "%s: error: unable to create context\n", __func__); + return 1; + } + const int n_ctx_train = llama_n_ctx_train(model); if (params.n_ctx > n_ctx_train) { fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", @@ -362,7 +472,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s\n", get_system_info(params).c_str()); } - bool OK = compute_imatrix(ctx, params); + bool OK = compute_imatrix(ctx, params, compute_ppl); if (!OK) { return 1; } diff --git a/examples/llama.android/.gitignore b/examples/llama.android/.gitignore new file mode 100644 index 000000000..347e252ef --- /dev/null +++ b/examples/llama.android/.gitignore @@ -0,0 +1,33 @@ +# Gradle files 
+.gradle/ +build/ + +# Local configuration file (sdk path, etc) +local.properties + +# Log/OS Files +*.log + +# Android Studio generated files and folders +captures/ +.externalNativeBuild/ +.cxx/ +*.apk +output.json + +# IntelliJ +*.iml +.idea/ +misc.xml +deploymentTargetDropDown.xml +render.experimental.xml + +# Keystore files +*.jks +*.keystore + +# Google Services (e.g. APIs or Firebase) +google-services.json + +# Android Profiling +*.hprof diff --git a/examples/llama.android/README.md b/examples/llama.android/README.md new file mode 100644 index 000000000..e69de29bb diff --git a/examples/llama.android/app/.gitignore b/examples/llama.android/app/.gitignore new file mode 100644 index 000000000..796b96d1c --- /dev/null +++ b/examples/llama.android/app/.gitignore @@ -0,0 +1 @@ +/build diff --git a/examples/llama.android/app/build.gradle.kts b/examples/llama.android/app/build.gradle.kts new file mode 100644 index 000000000..7815a8025 --- /dev/null +++ b/examples/llama.android/app/build.gradle.kts @@ -0,0 +1,91 @@ +plugins { + id("com.android.application") + id("org.jetbrains.kotlin.android") +} + +android { + namespace = "com.example.llama" + compileSdk = 34 + + ndkVersion = "26.1.10909125" + + defaultConfig { + applicationId = "com.example.llama" + minSdk = 33 + targetSdk = 34 + versionCode = 1 + versionName = "1.0" + + testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner" + vectorDrawables { + useSupportLibrary = true + } + ndk { + // Workaround for https://github.com/llvm/llvm-project/issues/65820 + // affecting armeabi-v7a. Skip armeabi-v7a when invoked with + // -Pskip-armeabi-v7a (e.g., ./gradlew build -Pskip-armeabi-v7a). + if (project.hasProperty("skip-armeabi-v7a")) { + abiFilters += listOf("arm64-v8a", "x86_64", "x86") + } + } + externalNativeBuild { + cmake { + cppFlags += listOf() + arguments += listOf() + } + } + } + + buildTypes { + release { + isMinifyEnabled = false + proguardFiles( + getDefaultProguardFile("proguard-android-optimize.txt"), + "proguard-rules.pro" + ) + } + } + compileOptions { + sourceCompatibility = JavaVersion.VERSION_1_8 + targetCompatibility = JavaVersion.VERSION_1_8 + } + kotlinOptions { + jvmTarget = "1.8" + } + buildFeatures { + compose = true + } + composeOptions { + kotlinCompilerExtensionVersion = "1.5.1" + } + packaging { + resources { + excludes += "/META-INF/{AL2.0,LGPL2.1}" + } + } + externalNativeBuild { + cmake { + path = file("src/main/cpp/CMakeLists.txt") + version = "3.22.1" + } + } +} + +dependencies { + + implementation("androidx.core:core-ktx:1.12.0") + implementation("androidx.lifecycle:lifecycle-runtime-ktx:2.6.2") + implementation("androidx.activity:activity-compose:1.8.2") + implementation(platform("androidx.compose:compose-bom:2023.08.00")) + implementation("androidx.compose.ui:ui") + implementation("androidx.compose.ui:ui-graphics") + implementation("androidx.compose.ui:ui-tooling-preview") + implementation("androidx.compose.material3:material3") + testImplementation("junit:junit:4.13.2") + androidTestImplementation("androidx.test.ext:junit:1.1.5") + androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1") + androidTestImplementation(platform("androidx.compose:compose-bom:2023.08.00")) + androidTestImplementation("androidx.compose.ui:ui-test-junit4") + debugImplementation("androidx.compose.ui:ui-tooling") + debugImplementation("androidx.compose.ui:ui-test-manifest") +} diff --git a/examples/llama.android/app/proguard-rules.pro b/examples/llama.android/app/proguard-rules.pro new file mode 100644 index 
000000000..f1b424510 --- /dev/null +++ b/examples/llama.android/app/proguard-rules.pro @@ -0,0 +1,21 @@ +# Add project specific ProGuard rules here. +# You can control the set of applied configuration files using the +# proguardFiles setting in build.gradle. +# +# For more details, see +# http://developer.android.com/guide/developing/tools/proguard.html + +# If your project uses WebView with JS, uncomment the following +# and specify the fully qualified class name to the JavaScript interface +# class: +#-keepclassmembers class fqcn.of.javascript.interface.for.webview { +# public *; +#} + +# Uncomment this to preserve the line number information for +# debugging stack traces. +#-keepattributes SourceFile,LineNumberTable + +# If you keep the line number information, uncomment this to +# hide the original source file name. +#-renamesourcefileattribute SourceFile diff --git a/examples/llama.android/app/src/main/AndroidManifest.xml b/examples/llama.android/app/src/main/AndroidManifest.xml new file mode 100644 index 000000000..41a358a29 --- /dev/null +++ b/examples/llama.android/app/src/main/AndroidManifest.xml @@ -0,0 +1,30 @@ + + + + + + + + + + + + + + + + + diff --git a/examples/llama.android/app/src/main/cpp/CMakeLists.txt b/examples/llama.android/app/src/main/cpp/CMakeLists.txt new file mode 100644 index 000000000..85139329a --- /dev/null +++ b/examples/llama.android/app/src/main/cpp/CMakeLists.txt @@ -0,0 +1,50 @@ + +# For more information about using CMake with Android Studio, read the +# documentation: https://d.android.com/studio/projects/add-native-code.html. +# For more examples on how to use CMake, see https://github.com/android/ndk-samples. + +# Sets the minimum CMake version required for this project. +cmake_minimum_required(VERSION 3.22.1) + +# Declares the project name. The project name can be accessed via ${ PROJECT_NAME}, +# Since this is the top level CMakeLists.txt, the project name is also accessible +# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level +# build script scope). +project("llama-android") + +include(FetchContent) +FetchContent_Declare( + llama + GIT_REPOSITORY https://github.com/ggerganov/llama.cpp + GIT_TAG master +) + +# Also provides "common" +FetchContent_MakeAvailable(llama) + +# Creates and names a library, sets it as either STATIC +# or SHARED, and provides the relative paths to its source code. +# You can define multiple libraries, and CMake builds them for you. +# Gradle automatically packages shared libraries with your APK. +# +# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define +# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME} +# is preferred for the same purpose. +# +# In order to load a library into your app from Java/Kotlin, you must call +# System.loadLibrary() and pass the name of the library defined here; +# for GameActivity/NativeActivity derived applications, the same library name must be +# used in the AndroidManifest.xml file. +add_library(${CMAKE_PROJECT_NAME} SHARED + # List C/C++ source files with relative paths to this CMakeLists.txt. + llama-android.cpp) + +# Specifies libraries CMake should link to your target library. You +# can link libraries from various origins, such as libraries defined in this +# build script, prebuilt third-party libraries, or Android system libraries. 
+target_link_libraries(${CMAKE_PROJECT_NAME} + # List libraries link to the target library + llama + common + android + log) diff --git a/examples/llama.android/app/src/main/cpp/llama-android.cpp b/examples/llama.android/app/src/main/cpp/llama-android.cpp new file mode 100644 index 000000000..d5e705dce --- /dev/null +++ b/examples/llama.android/app/src/main/cpp/llama-android.cpp @@ -0,0 +1,394 @@ +#include +#include +#include +#include +#include +#include +#include "llama.h" +#include "common/common.h" + +// Write C++ code here. +// +// Do not forget to dynamically load the C++ library into your application. +// +// For instance, +// +// In MainActivity.java: +// static { +// System.loadLibrary("llama-android"); +// } +// +// Or, in MainActivity.kt: +// companion object { +// init { +// System.loadLibrary("llama-android") +// } +// } + +#define TAG "llama-android.cpp" +#define LOGi(...) __android_log_print(ANDROID_LOG_INFO, TAG, __VA_ARGS__) +#define LOGe(...) __android_log_print(ANDROID_LOG_ERROR, TAG, __VA_ARGS__) + +jclass la_int_var; +jmethodID la_int_var_value; +jmethodID la_int_var_inc; + +static void log_callback(ggml_log_level level, const char * fmt, void * data) { + if (level == GGML_LOG_LEVEL_ERROR) __android_log_print(ANDROID_LOG_ERROR, TAG, fmt, data); + else if (level == GGML_LOG_LEVEL_INFO) __android_log_print(ANDROID_LOG_INFO, TAG, fmt, data); + else if (level == GGML_LOG_LEVEL_WARN) __android_log_print(ANDROID_LOG_WARN, TAG, fmt, data); + else __android_log_print(ANDROID_LOG_DEFAULT, TAG, fmt, data); +} + +extern "C" +JNIEXPORT jlong JNICALL +Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) { + llama_model_params model_params = llama_model_default_params(); + + auto path_to_model = env->GetStringUTFChars(filename, 0); + LOGi("Loading model from %s", path_to_model); + + auto model = llama_load_model_from_file(path_to_model, model_params); + env->ReleaseStringUTFChars(filename, path_to_model); + + if (!model) { + LOGe("load_model() failed"); + env->ThrowNew(env->FindClass("java/lang/IllegalStateException"), "load_model() failed"); + return 0; + } + + return reinterpret_cast(model); +} + +extern "C" +JNIEXPORT void JNICALL +Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) { + llama_free_model(reinterpret_cast(model)); +} + +extern "C" +JNIEXPORT jlong JNICALL +Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) { + auto model = reinterpret_cast(jmodel); + + if (!model) { + LOGe("new_context(): model cannot be null"); + env->ThrowNew(env->FindClass("java/lang/IllegalArgumentException"), "Model cannot be null"); + return 0; + } + + int n_threads = std::max(1, std::min(8, (int) sysconf(_SC_NPROCESSORS_ONLN) - 2)); + LOGi("Using %d threads", n_threads); + + llama_context_params ctx_params = llama_context_default_params(); + ctx_params.seed = 1234; + ctx_params.n_ctx = 2048; + ctx_params.n_threads = n_threads; + ctx_params.n_threads_batch = n_threads; + + llama_context * context = llama_new_context_with_model(model, ctx_params); + + if (!context) { + LOGe("llama_new_context_with_model() returned null)"); + env->ThrowNew(env->FindClass("java/lang/IllegalStateException"), + "llama_new_context_with_model() returned null)"); + return 0; + } + + return reinterpret_cast(context); +} + +extern "C" +JNIEXPORT void JNICALL +Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) { + llama_free(reinterpret_cast(context)); +} + +extern "C" +JNIEXPORT void JNICALL 
+Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) { + llama_backend_free(); +} + +extern "C" +JNIEXPORT void JNICALL +Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) { + llama_log_set(log_callback, NULL); +} + +extern "C" +JNIEXPORT jstring JNICALL +Java_com_example_llama_Llm_bench_1model( + JNIEnv *env, + jobject, + jlong context_pointer, + jlong model_pointer, + jlong batch_pointer, + jint pp, + jint tg, + jint pl, + jint nr + ) { + auto pp_avg = 0.0; + auto tg_avg = 0.0; + auto pp_std = 0.0; + auto tg_std = 0.0; + + const auto context = reinterpret_cast(context_pointer); + const auto model = reinterpret_cast(model_pointer); + const auto batch = reinterpret_cast(batch_pointer); + + const int n_ctx = llama_n_ctx(context); + + LOGi("n_ctx = %d", n_ctx); + + int i, j; + int nri; + for (nri = 0; nri < nr; nri++) { + LOGi("Benchmark prompt processing (pp)"); + + llama_batch_clear(*batch); + + const int n_tokens = pp; + for (i = 0; i < n_tokens; i++) { + llama_batch_add(*batch, 0, i, { 0 }, false); + } + + batch->logits[batch->n_tokens - 1] = true; + llama_kv_cache_clear(context); + + const auto t_pp_start = ggml_time_us(); + if (llama_decode(context, *batch) != 0) { + LOGi("llama_decode() failed during prompt processing"); + } + const auto t_pp_end = ggml_time_us(); + + // bench text generation + + LOGi("Benchmark text generation (tg)"); + + llama_kv_cache_clear(context); + const auto t_tg_start = ggml_time_us(); + for (i = 0; i < tg; i++) { + + llama_batch_clear(*batch); + for (j = 0; j < pl; j++) { + llama_batch_add(*batch, 0, i, { j }, true); + } + + LOGi("llama_decode() text generation: %d", i); + if (llama_decode(context, *batch) != 0) { + LOGi("llama_decode() failed during text generation"); + } + } + + const auto t_tg_end = ggml_time_us(); + + llama_kv_cache_clear(context); + + const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0; + const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0; + + const auto speed_pp = double(pp) / t_pp; + const auto speed_tg = double(pl * tg) / t_tg; + + pp_avg += speed_pp; + tg_avg += speed_tg; + + pp_std += speed_pp * speed_pp; + tg_std += speed_tg * speed_tg; + + LOGi("pp %f t/s, tg %f t/s", speed_pp, speed_tg); + } + + pp_avg /= double(nr); + tg_avg /= double(nr); + + if (nr > 1) { + pp_std = sqrt(pp_std / double(nr - 1) - pp_avg * pp_avg * double(nr) / double(nr - 1)); + tg_std = sqrt(tg_std / double(nr - 1) - tg_avg * tg_avg * double(nr) / double(nr - 1)); + } else { + pp_std = 0; + tg_std = 0; + } + + char model_desc[128]; + llama_model_desc(model, model_desc, sizeof(model_desc)); + + const auto model_size = double(llama_model_size(model)) / 1024.0 / 1024.0 / 1024.0; + const auto model_n_params = double(llama_model_n_params(model)) / 1e9; + + const auto backend = "(Android)"; // TODO: What should this be? 
+ + std::stringstream result; + result << std::setprecision(2); + result << "| model | size | params | backend | test | t/s |\n"; + result << "| --- | --- | --- | --- | --- | --- |\n"; + result << "| " << model_desc << " | " << model_size << "GiB | " << model_n_params << "B | " << backend << " | pp " << pp << " | " << pp_avg << " ± " << pp_std << " |\n"; + result << "| " << model_desc << " | " << model_size << "GiB | " << model_n_params << "B | " << backend << " | tg " << tg << " | " << tg_avg << " ± " << tg_std << " |\n"; + + return env->NewStringUTF(result.str().c_str()); +} + +extern "C" +JNIEXPORT void JNICALL +Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) { + llama_batch_free(*reinterpret_cast(batch_pointer)); +} + +extern "C" +JNIEXPORT jlong JNICALL +Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) { + + // Source: Copy of llama.cpp:llama_batch_init but heap-allocated. + + llama_batch *batch = new llama_batch { + 0, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + 0, + 0, + 0, + }; + + if (embd) { + batch->embd = (float *) malloc(sizeof(float) * n_tokens * embd); + } else { + batch->token = (llama_token *) malloc(sizeof(llama_token) * n_tokens); + } + + batch->pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens); + batch->n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens); + batch->seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens); + for (int i = 0; i < n_tokens; ++i) { + batch->seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max); + } + batch->logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens); + + return reinterpret_cast(batch); +} + +extern "C" +JNIEXPORT void JNICALL +Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject, jboolean numa) { + llama_backend_init(numa); +} + +extern "C" +JNIEXPORT jstring JNICALL +Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) { + return env->NewStringUTF(llama_print_system_info()); +} + +extern "C" +JNIEXPORT jint JNICALL +Java_com_example_llama_Llm_completion_1init( + JNIEnv *env, + jobject, + jlong context_pointer, + jlong batch_pointer, + jstring jtext, + jint n_len + ) { + + const auto text = env->GetStringUTFChars(jtext, 0); + const auto context = reinterpret_cast(context_pointer); + const auto batch = reinterpret_cast(batch_pointer); + + const auto tokens_list = llama_tokenize(context, text, 1); + + auto n_ctx = llama_n_ctx(context); + auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size()); + + LOGi("n_len = %d, n_ctx = %d, n_kv_req = %d", n_len, n_ctx, n_kv_req); + + if (n_kv_req > n_ctx) { + LOGe("error: n_kv_req > n_ctx, the required KV cache size is not big enough"); + } + + for (auto id : tokens_list) { + LOGi("%s", llama_token_to_piece(context, id).c_str()); + } + + llama_batch_clear(*batch); + + // evaluate the initial prompt + for (auto i = 0; i < tokens_list.size(); i++) { + llama_batch_add(*batch, tokens_list[i], i, { 0 }, false); + } + + // llama_decode will output logits only for the last token of the prompt + batch->logits[batch->n_tokens - 1] = true; + + if (llama_decode(context, *batch) != 0) { + LOGe("llama_decode() failed"); + } + + env->ReleaseStringUTFChars(jtext, text); + + return batch->n_tokens; +} + +extern "C" +JNIEXPORT jstring JNICALL +Java_com_example_llama_Llm_completion_1loop( + JNIEnv * env, + jobject, + jlong context_pointer, + jlong batch_pointer, + jint n_len, + jobject intvar_ncur +) { + const auto context = 
reinterpret_cast(context_pointer); + const auto batch = reinterpret_cast(batch_pointer); + const auto model = llama_get_model(context); + + if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur); + if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I"); + if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V"); + + auto n_vocab = llama_n_vocab(model); + auto logits = llama_get_logits_ith(context, batch->n_tokens - 1); + + std::vector candidates; + candidates.reserve(n_vocab); + + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + + // sample the most likely token + const auto new_token_id = llama_sample_token_greedy(context, &candidates_p); + + const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value); + if (new_token_id == llama_token_eos(model) || n_cur == n_len) { + return env->NewStringUTF(""); + } + + auto new_token_chars = llama_token_to_piece(context, new_token_id); + LOGi("new_token_chars: `%s`", new_token_chars.c_str()); + auto new_token = env->NewStringUTF(new_token_chars.c_str()); + + llama_batch_clear(*batch); + llama_batch_add(*batch, new_token_id, n_cur, { 0 }, true); + + env->CallVoidMethod(intvar_ncur, la_int_var_inc); + + if (llama_decode(context, *batch) != 0) { + LOGe("llama_decode() returned null"); + } + + return new_token; +} + +extern "C" +JNIEXPORT void JNICALL +Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) { + llama_kv_cache_clear(reinterpret_cast(context)); +} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt b/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt new file mode 100644 index 000000000..78c231ae5 --- /dev/null +++ b/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt @@ -0,0 +1,119 @@ +package com.example.llama + +import android.app.DownloadManager +import android.net.Uri +import android.util.Log +import androidx.compose.material3.Button +import androidx.compose.material3.Text +import androidx.compose.runtime.Composable +import androidx.compose.runtime.getValue +import androidx.compose.runtime.mutableDoubleStateOf +import androidx.compose.runtime.mutableStateOf +import androidx.compose.runtime.remember +import androidx.compose.runtime.rememberCoroutineScope +import androidx.compose.runtime.setValue +import androidx.core.database.getLongOrNull +import androidx.core.net.toUri +import kotlinx.coroutines.delay +import kotlinx.coroutines.launch +import java.io.File + +data class Downloadable(val name: String, val source: Uri, val destination: File) { + companion object { + @JvmStatic + private val tag: String? 
= this::class.qualifiedName + + sealed interface State + data object Ready: State + data class Downloading(val id: Long): State + data class Downloaded(val downloadable: Downloadable): State + data class Error(val message: String): State + + @JvmStatic + @Composable + fun Button(viewModel: MainViewModel, dm: DownloadManager, item: Downloadable) { + var status: State by remember { + mutableStateOf( + if (item.destination.exists()) Downloaded(item) + else Ready + ) + } + var progress by remember { mutableDoubleStateOf(0.0) } + + val coroutineScope = rememberCoroutineScope() + + suspend fun waitForDownload(result: Downloading, item: Downloadable): State { + while (true) { + val cursor = dm.query(DownloadManager.Query().setFilterById(result.id)) + + if (cursor == null) { + Log.e(tag, "dm.query() returned null") + return Error("dm.query() returned null") + } + + if (!cursor.moveToFirst() || cursor.count < 1) { + cursor.close() + Log.i(tag, "cursor.moveToFirst() returned false or cursor.count < 1, download canceled?") + return Ready + } + + val pix = cursor.getColumnIndex(DownloadManager.COLUMN_BYTES_DOWNLOADED_SO_FAR) + val tix = cursor.getColumnIndex(DownloadManager.COLUMN_TOTAL_SIZE_BYTES) + val sofar = cursor.getLongOrNull(pix) ?: 0 + val total = cursor.getLongOrNull(tix) ?: 1 + cursor.close() + + if (sofar == total) { + return Downloaded(item) + } + + progress = (sofar * 1.0) / total + + delay(1000L) + } + } + + fun onClick() { + when (val s = status) { + is Downloaded -> { + viewModel.load(item.destination.path) + } + + is Downloading -> { + coroutineScope.launch { + status = waitForDownload(s, item) + } + } + + else -> { + item.destination.delete() + + val request = DownloadManager.Request(item.source).apply { + setTitle("Downloading model") + setDescription("Downloading model: ${item.name}") + setAllowedNetworkTypes(DownloadManager.Request.NETWORK_WIFI) + setDestinationUri(item.destination.toUri()) + } + + viewModel.log("Saving ${item.name} to ${item.destination.path}") + Log.i(tag, "Saving ${item.name} to ${item.destination.path}") + + val id = dm.enqueue(request) + status = Downloading(id) + onClick() + } + } + } + + Button(onClick = { onClick() }, enabled = status !is Downloading) { + when (status) { + is Downloading -> Text(text = "Downloading ${(progress * 100).toInt()}%") + is Downloaded -> Text("Load ${item.name}") + is Ready -> Text("Download ${item.name}") + is Error -> Text("Download ${item.name}") + } + } + } + + } +} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt b/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt new file mode 100644 index 000000000..5f3270372 --- /dev/null +++ b/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt @@ -0,0 +1,172 @@ +package com.example.llama + +import android.util.Log +import kotlinx.coroutines.CoroutineDispatcher +import kotlinx.coroutines.asCoroutineDispatcher +import kotlinx.coroutines.flow.Flow +import kotlinx.coroutines.flow.flow +import kotlinx.coroutines.flow.flowOn +import kotlinx.coroutines.withContext +import java.util.concurrent.Executors +import kotlin.concurrent.thread + +class Llm { + private val tag: String? = this::class.simpleName + + private val threadLocalState: ThreadLocal = ThreadLocal.withInitial { State.Idle } + + private val runLoop: CoroutineDispatcher = Executors.newSingleThreadExecutor { + thread(start = false, name = "Llm-RunLoop") { + Log.d(tag, "Dedicated thread for native code: ${Thread.currentThread().name}") + + // No-op if called more than once. 
+ System.loadLibrary("llama-android") + + // Set llama log handler to Android + log_to_android() + backend_init(false) + + Log.d(tag, system_info()) + + it.run() + }.apply { + uncaughtExceptionHandler = Thread.UncaughtExceptionHandler { _, exception: Throwable -> + Log.e(tag, "Unhandled exception", exception) + } + } + }.asCoroutineDispatcher() + + private val nlen: Int = 64 + + private external fun log_to_android() + private external fun load_model(filename: String): Long + private external fun free_model(model: Long) + private external fun new_context(model: Long): Long + private external fun free_context(context: Long) + private external fun backend_init(numa: Boolean) + private external fun backend_free() + private external fun free_batch(batch: Long) + private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long + private external fun bench_model( + context: Long, + model: Long, + batch: Long, + pp: Int, + tg: Int, + pl: Int, + nr: Int + ): String + + private external fun system_info(): String + + private external fun completion_init( + context: Long, + batch: Long, + text: String, + nLen: Int + ): Int + + private external fun completion_loop( + context: Long, + batch: Long, + nLen: Int, + ncur: IntVar + ): String + + private external fun kv_cache_clear(context: Long) + + suspend fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1): String { + return withContext(runLoop) { + when (val state = threadLocalState.get()) { + is State.Loaded -> { + Log.d(tag, "bench(): $state") + bench_model(state.context, state.model, state.batch, pp, tg, pl, nr) + } + + else -> throw IllegalStateException("No model loaded") + } + } + } + + suspend fun load(pathToModel: String) { + withContext(runLoop) { + when (threadLocalState.get()) { + is State.Idle -> { + val model = load_model(pathToModel) + if (model == 0L) throw IllegalStateException("load_model() failed") + + val context = new_context(model) + if (context == 0L) throw IllegalStateException("new_context() failed") + + val batch = new_batch(512, 0, 1) + if (batch == 0L) throw IllegalStateException("new_batch() failed") + + Log.i(tag, "Loaded model $pathToModel") + threadLocalState.set(State.Loaded(model, context, batch)) + } + else -> throw IllegalStateException("Model already loaded") + } + } + } + + fun send(message: String): Flow = flow { + when (val state = threadLocalState.get()) { + is State.Loaded -> { + val ncur = IntVar(completion_init(state.context, state.batch, message, nlen)) + while (ncur.value <= nlen) { + val str = completion_loop(state.context, state.batch, nlen, ncur) + if (str.isEmpty()) { + break + } + emit(str) + } + kv_cache_clear(state.context) + } + else -> {} + } + }.flowOn(runLoop) + + /** + * Unloads the model and frees resources. + * + * This is a no-op if there's no model loaded. + */ + suspend fun unload() { + withContext(runLoop) { + when (val state = threadLocalState.get()) { + is State.Loaded -> { + free_context(state.context) + free_model(state.model) + free_batch(state.batch) + + threadLocalState.set(State.Idle) + } + else -> {} + } + } + } + + companion object { + private class IntVar(value: Int) { + @Volatile + var value: Int = value + private set + + fun inc() { + synchronized(this) { + value += 1 + } + } + } + + private sealed interface State { + data object Idle: State + data class Loaded(val model: Long, val context: Long, val batch: Long): State + } + + // Enforce only one instance of Llm. 
+ private val _instance: Llm = Llm() + + fun instance(): Llm = _instance + } +} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt b/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt new file mode 100644 index 000000000..9da04f7d3 --- /dev/null +++ b/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt @@ -0,0 +1,154 @@ +package com.example.llama + +import android.app.ActivityManager +import android.app.DownloadManager +import android.content.ClipData +import android.content.ClipboardManager +import android.net.Uri +import android.os.Bundle +import android.os.StrictMode +import android.os.StrictMode.VmPolicy +import android.text.format.Formatter +import androidx.activity.ComponentActivity +import androidx.activity.compose.setContent +import androidx.activity.viewModels +import androidx.compose.foundation.layout.Box +import androidx.compose.foundation.layout.Column +import androidx.compose.foundation.layout.Row +import androidx.compose.foundation.layout.fillMaxSize +import androidx.compose.foundation.layout.padding +import androidx.compose.foundation.lazy.LazyColumn +import androidx.compose.foundation.lazy.items +import androidx.compose.foundation.lazy.rememberLazyListState +import androidx.compose.material3.Button +import androidx.compose.material3.LocalContentColor +import androidx.compose.material3.MaterialTheme +import androidx.compose.material3.OutlinedTextField +import androidx.compose.material3.Surface +import androidx.compose.material3.Text +import androidx.compose.runtime.Composable +import androidx.compose.ui.Modifier +import androidx.compose.ui.unit.dp +import androidx.core.content.getSystemService +import com.example.llama.ui.theme.LlamaAndroidTheme +import java.io.File + +class MainActivity( + activityManager: ActivityManager? = null, + downloadManager: DownloadManager? = null, + clipboardManager: ClipboardManager? = null, +): ComponentActivity() { + private val tag: String? = this::class.simpleName + + private val activityManager by lazy { activityManager ?: getSystemService()!! } + private val downloadManager by lazy { downloadManager ?: getSystemService()!! } + private val clipboardManager by lazy { clipboardManager ?: getSystemService()!! } + + private val viewModel: MainViewModel by viewModels() + + // Get a MemoryInfo object for the device's current memory status. + private fun availableMemory(): ActivityManager.MemoryInfo { + return ActivityManager.MemoryInfo().also { memoryInfo -> + activityManager.getMemoryInfo(memoryInfo) + } + } + + override fun onCreate(savedInstanceState: Bundle?) 
{ + super.onCreate(savedInstanceState) + + StrictMode.setVmPolicy( + VmPolicy.Builder(StrictMode.getVmPolicy()) + .detectLeakedClosableObjects() + .build() + ) + + val free = Formatter.formatFileSize(this, availableMemory().availMem) + val total = Formatter.formatFileSize(this, availableMemory().totalMem) + + viewModel.log("Current memory: $free / $total") + viewModel.log("Downloads directory: ${getExternalFilesDir(null)}") + + val extFilesDir = getExternalFilesDir(null) + + val models = listOf( + Downloadable( + "Phi-2 7B (Q4_0, 1.6 GiB)", + Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true"), + File(extFilesDir, "phi-2-q4_0.gguf"), + ), + Downloadable( + "TinyLlama 1.1B (f16, 2.2 GiB)", + Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true"), + File(extFilesDir, "tinyllama-1.1-f16.gguf"), + ), + Downloadable( + "Phi 2 DPO (Q3_K_M, 1.48 GiB)", + Uri.parse("https://huggingface.co/TheBloke/phi-2-dpo-GGUF/resolve/main/phi-2-dpo.Q3_K_M.gguf?download=true"), + File(extFilesDir, "phi-2-dpo.Q3_K_M.gguf") + ), + ) + + setContent { + LlamaAndroidTheme { + // A surface container using the 'background' color from the theme + Surface( + modifier = Modifier.fillMaxSize(), + color = MaterialTheme.colorScheme.background + ) { + MainCompose( + viewModel, + clipboardManager, + downloadManager, + models, + ) + } + + } + } + } +} + +@Composable +fun MainCompose( + viewModel: MainViewModel, + clipboard: ClipboardManager, + dm: DownloadManager, + models: List +) { + Column { + val scrollState = rememberLazyListState() + + Box(modifier = Modifier.weight(1f)) { + LazyColumn(state = scrollState) { + items(viewModel.messages) { + Text( + it, + style = MaterialTheme.typography.bodyLarge.copy(color = LocalContentColor.current), + modifier = Modifier.padding(16.dp) + ) + } + } + } + OutlinedTextField( + value = viewModel.message, + onValueChange = { viewModel.updateMessage(it) }, + label = { Text("Message") }, + ) + Row { + Button({ viewModel.send() }) { Text("Send") } + Button({ viewModel.bench(8, 4, 1) }) { Text("Bench") } + Button({ viewModel.clear() }) { Text("Clear") } + Button({ + viewModel.messages.joinToString("\n").let { + clipboard.setPrimaryClip(ClipData.newPlainText("", it)) + } + }) { Text("Copy") } + } + + Column { + for (model in models) { + Downloadable.Button(viewModel, dm, model) + } + } + } +} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt b/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt new file mode 100644 index 000000000..be95e2221 --- /dev/null +++ b/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt @@ -0,0 +1,104 @@ +package com.example.llama + +import android.util.Log +import androidx.compose.runtime.getValue +import androidx.compose.runtime.mutableStateOf +import androidx.compose.runtime.setValue +import androidx.lifecycle.ViewModel +import androidx.lifecycle.viewModelScope +import kotlinx.coroutines.flow.catch +import kotlinx.coroutines.launch + +class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() { + companion object { + @JvmStatic + private val NanosPerSecond = 1_000_000_000.0 + } + + private val tag: String? 
= this::class.simpleName + + var messages by mutableStateOf(listOf("Initializing...")) + private set + + var message by mutableStateOf("") + private set + + override fun onCleared() { + super.onCleared() + + viewModelScope.launch { + try { + llm.unload() + } catch (exc: IllegalStateException) { + messages += exc.message!! + } + } + } + + fun send() { + val text = message + message = "" + + // Add to messages console. + messages += text + messages += "" + + viewModelScope.launch { + llm.send(text) + .catch { + Log.e(tag, "send() failed", it) + messages += it.message!! + } + .collect { messages = messages.dropLast(1) + (messages.last() + it) } + } + } + + fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1) { + viewModelScope.launch { + try { + val start = System.nanoTime() + val warmupResult = llm.bench(pp, tg, pl, nr) + val end = System.nanoTime() + + messages += warmupResult + + val warmup = (end - start).toDouble() / NanosPerSecond + messages += "Warm up time: $warmup seconds, please wait..." + + if (warmup > 5.0) { + messages += "Warm up took too long, aborting benchmark" + return@launch + } + + messages += llm.bench(512, 128, 1, 3) + } catch (exc: IllegalStateException) { + Log.e(tag, "bench() failed", exc) + messages += exc.message!! + } + } + } + + fun load(pathToModel: String) { + viewModelScope.launch { + try { + llm.load(pathToModel) + messages += "Loaded $pathToModel" + } catch (exc: IllegalStateException) { + Log.e(tag, "load() failed", exc) + messages += exc.message!! + } + } + } + + fun updateMessage(newMessage: String) { + message = newMessage + } + + fun clear() { + messages = listOf() + } + + fun log(message: String) { + messages += message + } +} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt new file mode 100644 index 000000000..40c30e8d9 --- /dev/null +++ b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt @@ -0,0 +1,11 @@ +package com.example.llama.ui.theme + +import androidx.compose.ui.graphics.Color + +val Purple80 = Color(0xFFD0BCFF) +val PurpleGrey80 = Color(0xFFCCC2DC) +val Pink80 = Color(0xFFEFB8C8) + +val Purple40 = Color(0xFF6650a4) +val PurpleGrey40 = Color(0xFF625b71) +val Pink40 = Color(0xFF7D5260) diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt new file mode 100644 index 000000000..e742220a8 --- /dev/null +++ b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt @@ -0,0 +1,70 @@ +package com.example.llama.ui.theme + +import android.app.Activity +import android.os.Build +import androidx.compose.foundation.isSystemInDarkTheme +import androidx.compose.material3.MaterialTheme +import androidx.compose.material3.darkColorScheme +import androidx.compose.material3.dynamicDarkColorScheme +import androidx.compose.material3.dynamicLightColorScheme +import androidx.compose.material3.lightColorScheme +import androidx.compose.runtime.Composable +import androidx.compose.runtime.SideEffect +import androidx.compose.ui.graphics.toArgb +import androidx.compose.ui.platform.LocalContext +import androidx.compose.ui.platform.LocalView +import androidx.core.view.WindowCompat + +private val DarkColorScheme = darkColorScheme( + primary = Purple80, + secondary = PurpleGrey80, + tertiary = Pink80 +) + +private val LightColorScheme = lightColorScheme( + primary = Purple40, + secondary = PurpleGrey40, 
+ tertiary = Pink40 + + /* Other default colors to override + background = Color(0xFFFFFBFE), + surface = Color(0xFFFFFBFE), + onPrimary = Color.White, + onSecondary = Color.White, + onTertiary = Color.White, + onBackground = Color(0xFF1C1B1F), + onSurface = Color(0xFF1C1B1F), + */ +) + +@Composable +fun LlamaAndroidTheme( + darkTheme: Boolean = isSystemInDarkTheme(), + // Dynamic color is available on Android 12+ + dynamicColor: Boolean = true, + content: @Composable () -> Unit +) { + val colorScheme = when { + dynamicColor && Build.VERSION.SDK_INT >= Build.VERSION_CODES.S -> { + val context = LocalContext.current + if (darkTheme) dynamicDarkColorScheme(context) else dynamicLightColorScheme(context) + } + + darkTheme -> DarkColorScheme + else -> LightColorScheme + } + val view = LocalView.current + if (!view.isInEditMode) { + SideEffect { + val window = (view.context as Activity).window + window.statusBarColor = colorScheme.primary.toArgb() + WindowCompat.getInsetsController(window, view).isAppearanceLightStatusBars = darkTheme + } + } + + MaterialTheme( + colorScheme = colorScheme, + typography = Typography, + content = content + ) +} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt new file mode 100644 index 000000000..0b87946ca --- /dev/null +++ b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt @@ -0,0 +1,34 @@ +package com.example.llama.ui.theme + +import androidx.compose.material3.Typography +import androidx.compose.ui.text.TextStyle +import androidx.compose.ui.text.font.FontFamily +import androidx.compose.ui.text.font.FontWeight +import androidx.compose.ui.unit.sp + +// Set of Material typography styles to start with +val Typography = Typography( + bodyLarge = TextStyle( + fontFamily = FontFamily.Default, + fontWeight = FontWeight.Normal, + fontSize = 16.sp, + lineHeight = 24.sp, + letterSpacing = 0.5.sp + ) + /* Other default text styles to override + titleLarge = TextStyle( + fontFamily = FontFamily.Default, + fontWeight = FontWeight.Normal, + fontSize = 22.sp, + lineHeight = 28.sp, + letterSpacing = 0.sp + ), + labelSmall = TextStyle( + fontFamily = FontFamily.Default, + fontWeight = FontWeight.Medium, + fontSize = 11.sp, + lineHeight = 16.sp, + letterSpacing = 0.5.sp + ) + */ +) diff --git a/examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml b/examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml new file mode 100644 index 000000000..07d5da9cb --- /dev/null +++ b/examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml @@ -0,0 +1,170 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml b/examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml new file mode 100644 index 000000000..7706ab9e6 --- /dev/null +++ b/examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml @@ -0,0 +1,30 @@ + + + + + + + + + + + diff --git a/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml b/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml new file mode 100644 index 000000000..b3e26b4c6 --- /dev/null +++ b/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml 
b/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml new file mode 100644 index 000000000..b3e26b4c6 --- /dev/null +++ b/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher.webp new file mode 100644 index 000000000..c209e78ec Binary files /dev/null and b/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher.webp differ diff --git a/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp new file mode 100644 index 000000000..b2dfe3d1b Binary files /dev/null and b/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp differ diff --git a/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher.webp new file mode 100644 index 000000000..4f0f1d64e Binary files /dev/null and b/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher.webp differ diff --git a/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp new file mode 100644 index 000000000..62b611da0 Binary files /dev/null and b/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp differ diff --git a/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher.webp new file mode 100644 index 000000000..948a3070f Binary files /dev/null and b/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher.webp differ diff --git a/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp new file mode 100644 index 000000000..1b9a6956b Binary files /dev/null and b/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp differ diff --git a/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp new file mode 100644 index 000000000..28d4b77f9 Binary files /dev/null and b/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp differ diff --git a/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp new file mode 100644 index 000000000..9287f5083 Binary files /dev/null and b/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp differ diff --git a/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp new file mode 100644 index 000000000..aa7d6427e Binary files /dev/null and b/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp differ diff --git a/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp new file mode 100644 index 000000000..9126ae37c Binary files /dev/null and b/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp differ diff --git a/examples/llama.android/app/src/main/res/values/colors.xml 
b/examples/llama.android/app/src/main/res/values/colors.xml new file mode 100644 index 000000000..ca1931bca --- /dev/null +++ b/examples/llama.android/app/src/main/res/values/colors.xml @@ -0,0 +1,10 @@ + + + #FFBB86FC + #FF6200EE + #FF3700B3 + #FF03DAC5 + #FF018786 + #FF000000 + #FFFFFFFF + diff --git a/examples/llama.android/app/src/main/res/values/strings.xml b/examples/llama.android/app/src/main/res/values/strings.xml new file mode 100644 index 000000000..7a9d314e2 --- /dev/null +++ b/examples/llama.android/app/src/main/res/values/strings.xml @@ -0,0 +1,3 @@ + + LlamaAndroid + diff --git a/examples/llama.android/app/src/main/res/values/themes.xml b/examples/llama.android/app/src/main/res/values/themes.xml new file mode 100644 index 000000000..8a24fda56 --- /dev/null +++ b/examples/llama.android/app/src/main/res/values/themes.xml @@ -0,0 +1,5 @@ + + + +
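A note on the evaluation-callback mechanism used by the reworked `imatrix` example in this patch: the collector no longer hooks into ggml directly, but registers a scheduler callback through `llama_context_params` (`cb_eval` / `cb_eval_user_data`). The callback is invoked twice per graph node, first with `ask == true` to declare interest, then with `ask == false` once the tensor data is available. Below is a minimal sketch of that pattern, not part of the patch itself; it assumes only the APIs that appear in this diff plus the standard ggml backend helpers, and the callback name `print_mul_mat_cb` and its printing behavior are illustrative.

```cpp
// Minimal sketch of a scheduler eval callback following the two-phase
// ask/collect protocol used by IMatrixCollector above.
// Headers assumed from the llama.cpp source tree.
#include <cstdio>
#include <vector>

#include "ggml.h"
#include "ggml-backend.h"
#include "llama.h"

static bool print_mul_mat_cb(struct ggml_tensor * t, bool ask, void * user_data) {
    GGML_UNUSED(user_data);

    const struct ggml_tensor * src0 = t->src[0];
    const struct ggml_tensor * src1 = t->src[1];

    if (ask) {
        // phase 1: tell the scheduler whether we want the data for this node
        return t->op == GGML_OP_MUL_MAT && src1->type == GGML_TYPE_F32;
    }

    // phase 2: the tensor data is available now; copy activations to host if needed
    std::vector<float> buf;
    const float * data;
    if (ggml_backend_buffer_is_host(src1->buffer)) {
        data = (const float *) src1->data;
    } else {
        buf.resize(ggml_nelements(src1));
        ggml_backend_tensor_get(src1, buf.data(), 0, ggml_nbytes(src1));
        data = buf.data();
    }

    printf("%s: %s * %s, %d x %d, first value %f\n",
           __func__, src0->name, src1->name, (int) src1->ne[0], (int) src1->ne[1], data[0]);

    return true; // continue the graph computation
}

// Registration, as done in examples/imatrix/imatrix.cpp main():
//   llama_context_params cparams = llama_context_params_from_gpt_params(params);
//   cparams.cb_eval           = print_mul_mat_cb;
//   cparams.cb_eval_user_data = NULL;
//   llama_context * ctx = llama_new_context_with_model(model, cparams);
```

Returning `false` in the `ask` phase avoids any device-to-host copy for nodes the observer does not care about, which is why the imatrix collector filters on op type, tensor name, and batch size in that phase.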