Lizonghang 2024-10-23 09:42:32 +04:00
parent 6374743747
commit 2a01ff5fb1
10 changed files with 4725 additions and 1026 deletions

Makefile

@@ -262,6 +262,14 @@ MK_CFLAGS = -std=c11 -fPIC
MK_CXXFLAGS = -std=c++11 -fPIC
MK_NVCCFLAGS = -std=c++11
ifeq ($(UNAME_S),Darwin)
MK_CPPFLAGS += -I/opt/homebrew/include
MK_LDFLAGS += -L/opt/homebrew/lib -lzmq
else ifeq ($(UNAME_S),Linux)
MK_CPPFLAGS += -I/usr/local/include
MK_LDFLAGS += -L/usr/local/lib -lzmq
endif
ifdef LLAMA_NO_CCACHE
GGML_NO_CCACHE := 1
DEPRECATE_WARNING := 1
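
These Makefile additions link against ZeroMQ (`-lzmq`), for which this commit also vendors the C++ headers (`include/zmq.hpp`, `include/zmq_addon.hpp`). Below is a minimal sketch to confirm that libzmq is installed and links with the include/lib paths above; the file name and the choice of a PAIR socket are illustrative only, not part of the commit.

```cpp
// zmq_link_check.cpp -- sanity check for the new libzmq dependency (sketch).
// Build (Linux):  g++ -std=c++11 zmq_link_check.cpp -I/usr/local/include -L/usr/local/lib -lzmq
// Build (macOS):  g++ -std=c++11 zmq_link_check.cpp -I/opt/homebrew/include -L/opt/homebrew/lib -lzmq
#include <cstdio>
#include <zmq.hpp>

int main() {
    zmq::context_t ctx(1);                             // one I/O thread
    zmq::socket_t  sock(ctx, zmq::socket_type::pair);  // any socket type works for a link check
    sock.bind("tcp://127.0.0.1:5555");
    std::printf("libzmq %d.%d.%d linked successfully\n",
                ZMQ_VERSION_MAJOR, ZMQ_VERSION_MINOR, ZMQ_VERSION_PATCH);
    return 0;
}
```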

README.md

@@ -1,479 +1,3 @@
# llama.cpp
# prima.cpp
![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
[![Conan Center](https://shields.io/conan/v/llama-cpp)](https://conan.io/center/llama-cpp)
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
## Recent API changes
- [Changelog for `libllama` API](https://github.com/ggerganov/llama.cpp/issues/9289)
- [Changelog for `llama-server` REST API](https://github.com/ggerganov/llama.cpp/issues/9291)
## Hot topics
- **Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669**
- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
----
## Description
The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
variety of hardware - locally and in the cloud.
- Plain C/C++ implementation without any dependencies
- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
- AVX, AVX2 and AVX512 support for x86 architectures
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
- Vulkan and SYCL backend support
- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has
improved significantly thanks to many contributions. It is the main playground for developing new features for the
[ggml](https://github.com/ggerganov/ggml) library.
**Supported models:**
Typically finetunes of the base models below are supported as well.
- [X] LLaMA 🦙
- [x] LLaMA 2 🦙🦙
- [x] LLaMA 3 🦙🦙🦙
- [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
- [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct)
- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
- [X] [BERT](https://github.com/ggerganov/llama.cpp/pull/5423)
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
- [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
- [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
- [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
- [X] [StableLM models](https://huggingface.co/stabilityai)
- [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek)
- [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
- [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557)
- [x] [Phi models](https://huggingface.co/models?search=microsoft/phi)
- [x] [GPT-2](https://huggingface.co/gpt2)
- [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118)
- [x] [InternLM2](https://huggingface.co/models?search=internlm2)
- [x] [CodeShell](https://github.com/WisdomShell/codeshell)
- [x] [Gemma](https://ai.google.dev/gemma)
- [x] [Mamba](https://github.com/state-spaces/mamba)
- [x] [Grok-1](https://huggingface.co/keyfan/grok-1-hf)
- [x] [Xverse](https://huggingface.co/models?search=xverse)
- [x] [Command-R models](https://huggingface.co/models?search=CohereForAI/c4ai-command-r)
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
- [x] [OLMo](https://allenai.org/olmo)
- [x] [OLMoE](https://huggingface.co/allenai/OLMoE-1B-7B-0924)
- [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330)
- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
- [x] [Snowflake-Arctic MoE](https://huggingface.co/collections/Snowflake/arctic-66290090abe542894a5ac520)
- [x] [Smaug](https://huggingface.co/models?search=Smaug)
- [x] [Poro 34B](https://huggingface.co/LumiOpen/Poro-34B)
- [x] [Bitnet b1.58 models](https://huggingface.co/1bitLLM)
- [x] [Flan T5](https://huggingface.co/models?search=flan-t5)
- [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca)
- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)
- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
- [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
**Multimodal models:**
- [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [LLaVA 1.6 models](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2)
- [x] [BakLLaVA](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
- [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5)
- [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
**Bindings:**
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
- JS/TS (Programmable Prompt Engine CLI): [offline-ai/cli](https://github.com/offline-ai/cli)
- JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
- Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama)
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
- Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs)
- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
- Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
**UI:**
Unless otherwise noted these projects are open-source with permissive licensing:
- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
- [iohub/collama](https://github.com/iohub/coLLaMA)
- [janhq/jan](https://github.com/janhq/jan) (AGPL)
- [nat/openplayground](https://github.com/nat/openplayground)
- [Faraday](https://faraday.dev/) (proprietary)
- [LMStudio](https://lmstudio.ai/) (proprietary)
- [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
- [ramalama](https://github.com/containers/ramalama) (MIT)
- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all)
- [ollama/ollama](https://github.com/ollama/ollama)
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
- [RAGNA Desktop](https://ragna.app/) (proprietary)
- [RecurseChat](https://recurse.chat/) (proprietary)
- [semperai/amica](https://github.com/semperai/amica)
- [withcatai/catai](https://github.com/withcatai/catai)
- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
- [Msty](https://msty.app) (proprietary)
- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file)(Apachev2.0 or later)
- [Dot](https://github.com/alexpinel/Dot) (GPL)
- [MindMac](https://mindmac.app) (proprietary)
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
- [AIKit](https://github.com/sozercan/aikit) (MIT)
- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
**Tools:**
- [akx/ggify](https://github.com/akx/ggify) download PyTorch models from HuggingFace Hub and convert them to GGML
- [akx/ollama-dl](https://github.com/akx/ollama-dl) download models from the Ollama library to be used directly with llama.cpp
- [crashr/gppm](https://github.com/crashr/gppm) launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example)
**Infrastructure:**
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
**Games:**
- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
## Demo
<details>
<summary>Typical run using LLaMA v2 13B on M2 Ultra</summary>
```
$ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
I llama.cpp build info:
I UNAME_S: Darwin
I UNAME_P: arm
I UNAME_M: arm64
I CFLAGS: -I. -O3 -std=c11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -pthread -DGGML_USE_K_QUANTS -DGGML_USE_ACCELERATE
I CXXFLAGS: -I. -I./common -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS
I LDFLAGS: -framework Accelerate
I CC: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
make: Nothing to be done for `default'.
main: build = 1041 (cf658ad)
main: seed = 1692823051
llama_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from models/llama-13b-v2/ggml-model-q4_0.gguf (version GGUF V1 (latest))
llama_model_loader: - type f32: 81 tensors
llama_model_loader: - type q4_0: 281 tensors
llama_model_loader: - type q6_K: 1 tensors
llm_load_print_meta: format = GGUF V1 (latest)
llm_load_print_meta: arch = llama
llm_load_print_meta: vocab type = SPM
llm_load_print_meta: n_vocab = 32000
llm_load_print_meta: n_merges = 0
llm_load_print_meta: n_ctx_train = 4096
llm_load_print_meta: n_ctx = 512
llm_load_print_meta: n_embd = 5120
llm_load_print_meta: n_head = 40
llm_load_print_meta: n_head_kv = 40
llm_load_print_meta: n_layer = 40
llm_load_print_meta: n_rot = 128
llm_load_print_meta: n_gqa = 1
llm_load_print_meta: f_norm_eps = 1.0e-05
llm_load_print_meta: f_norm_rms_eps = 1.0e-05
llm_load_print_meta: n_ff = 13824
llm_load_print_meta: freq_base = 10000.0
llm_load_print_meta: freq_scale = 1
llm_load_print_meta: model type = 13B
llm_load_print_meta: model ftype = mostly Q4_0
llm_load_print_meta: model size = 13.02 B
llm_load_print_meta: general.name = LLaMA v2
llm_load_print_meta: BOS token = 1 '<s>'
llm_load_print_meta: EOS token = 2 '</s>'
llm_load_print_meta: UNK token = 0 '<unk>'
llm_load_print_meta: LF token = 13 '<0x0A>'
llm_load_tensors: ggml ctx size = 0.11 MB
llm_load_tensors: mem required = 7024.01 MB (+ 400.00 MB per state)
...................................................................................................
llama_new_context_with_model: kv self size = 400.00 MB
llama_new_context_with_model: compute buffer total size = 75.41 MB
system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000
generate: n_ctx = 512, n_batch = 512, n_predict = 400, n_keep = 0
Building a website can be done in 10 simple steps:
Step 1: Find the right website platform.
Step 2: Choose your domain name and hosting plan.
Step 3: Design your website layout.
Step 4: Write your website content and add images.
Step 5: Install security features to protect your site from hackers or spammers
Step 6: Test your website on multiple browsers, mobile devices, operating systems etc…
Step 7: Test it again with people who are not related to you personally friends or family members will work just fine!
Step 8: Start marketing and promoting the website via social media channels or paid ads
Step 9: Analyze how many visitors have come to your site so far, what type of people visit more often than others (e.g., men vs women) etc…
Step 10: Continue to improve upon all aspects mentioned above by following trends in web design and staying up-to-date on new technologies that can enhance user experience even further!
How does a Website Work?
A website works by having pages, which are made of HTML code. This code tells your computer how to display the content on each page you visit whether its an image or text file (like PDFs). In order for someone elses browser not only be able but also want those same results when accessing any given URL; some additional steps need taken by way of programming scripts that will add functionality such as making links clickable!
The most common type is called static HTML pages because they remain unchanged over time unless modified manually (either through editing files directly or using an interface such as WordPress). They are usually served up via HTTP protocols this means anyone can access them without having any special privileges like being part of a group who is allowed into restricted areas online; however, there may still exist some limitations depending upon where one lives geographically speaking.
How to
llama_print_timings: load time = 576.45 ms
llama_print_timings: sample time = 283.10 ms / 400 runs ( 0.71 ms per token, 1412.91 tokens per second)
llama_print_timings: prompt eval time = 599.83 ms / 19 tokens ( 31.57 ms per token, 31.68 tokens per second)
llama_print_timings: eval time = 24513.59 ms / 399 runs ( 61.44 ms per token, 16.28 tokens per second)
llama_print_timings: total time = 25431.49 ms
```
</details>
<details>
<summary>Demo of running both LLaMA-7B and whisper.cpp on a single M1 Pro MacBook</summary>
And here is another demo of running both LLaMA-7B and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) on a single M1 Pro MacBook:
https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8b4f-add84093ffff.mp4
</details>
## Usage
Here are the end-to-end binary build and model conversion steps for most supported models.
### Basic usage
Firstly, you need to get the binary. There are different methods that you can follow:
- Method 1: Clone this repository and build locally, see [how to build](./docs/build.md)
- Method 2: If you are using MacOS or Linux, you can install llama.cpp via [brew, flox or nix](./docs/install.md)
- Method 3: Use a Docker image, see [documentation for Docker](./docs/docker.md)
- Method 4: Download pre-built binary from [releases](https://github.com/ggerganov/llama.cpp/releases)
You can run a basic completion using this command:
```bash
llama-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128
# Output:
# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
```
See [this page](./examples/main/README.md) for a full list of parameters.
### Conversation mode
If you want a more ChatGPT-like experience, you can run in conversation mode by passing `-cnv` as a parameter:
```bash
llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv
# Output:
# > hi, who are you?
# Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today?
#
# > what is 1+1?
# Easy peasy! The answer to 1+1 is... 2!
```
By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. See the list of [supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
```bash
./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml
```
You can also use your own template via in-prefix, in-suffix and reverse-prompt parameters:
```bash
./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
```
### Web server
[llama.cpp web server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
Example usage:
```bash
./llama-server -m your_model.gguf --port 8080
# Basic web UI can be accessed via browser: http://localhost:8080
# Chat completion endpoint: http://localhost:8080/v1/chat/completions
```
### Interactive mode
> [!NOTE]
> If you prefer basic usage, please consider using conversation mode instead of interactive mode
In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMA emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
Here is an example of a few-shot interaction, invoked with the command
```bash
# default arguments using a 7B model
./examples/chat.sh
# advanced chat with a 13B model
./examples/chat-13B.sh
# custom arguments using a 13B model
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
```
Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `llama-cli` example program.
![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png)
### Persistent Interaction
The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
```bash
# Start a new chat
PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
# Resume that chat
PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
# Start a different chat with the same prompt/model
PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/another ./examples/chat-persistent.sh
# Different prompt cache for different prompt/model
PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
CHAT_SAVE_DIR=./chat/bob ./examples/chat-persistent.sh
```
### Constrained output with grammars
`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:
```bash
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
```
The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
## Build
Please refer to [Build llama.cpp locally](./docs/build.md)
## Supported backends
| Backend | Target devices |
| --- | --- |
| [Metal](./docs/build.md#metal-build) | Apple Silicon |
| [BLAS](./docs/build.md#blas-build) | All |
| [BLIS](./docs/backend/BLIS.md) | All |
| [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU |
| [MUSA](./docs/build.md#musa) | Moore Threads GPU |
| [CUDA](./docs/build.md#cuda) | Nvidia GPU |
| [hipBLAS](./docs/build.md#hipblas) | AMD GPU |
| [Vulkan](./docs/build.md#vulkan) | GPU |
| [CANN](./docs/build.md#cann) | Ascend NPU |
## Tools
### Prepare and Quantize
> [!NOTE]
> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours.
To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
Note: `convert.py` has been moved to `examples/convert_legacy_llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
It does not support LLaMA 3, you can use `convert_hf_to_gguf.py` with LLaMA 3 downloaded from Hugging Face.
To learn more about quantizing model, [read this documentation](./examples/quantize/README.md)
### Perplexity (measuring model quality)
You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better).
For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity).
To learn more how to measure perplexity using llama.cpp, [read this documentation](./examples/perplexity/README.md)
## Contributing
- Contributors can open PRs
- Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
- Collaborators will be invited based on contributions
- Any help with managing issues, PRs and projects is very appreciated!
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
- Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information
- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
- A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)
## Other documentations
- [main (cli)](./examples/main/README.md)
- [server](./examples/server/README.md)
- [jeopardy](./examples/jeopardy/README.md)
- [GBNF grammars](./grammars/README.md)
**Development documentations**
- [How to build](./docs/build.md)
- [Running on Docker](./docs/docker.md)
- [Build on Android](./docs/android.md)
- [Performance troubleshooting](./docs/development/token_generation_performance_tips.md)
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
**Seminal papers and background on the models**
If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
- LLaMA:
- [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/)
- [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)
- GPT-3
- [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165)
- GPT-3.5 / InstructGPT / ChatGPT:
- [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
- [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
This is a distributed implementation of [llama.cpp](https://github.com/ggerganov/llama.cpp), coming soon.

common/arg.cpp

@@ -661,6 +661,41 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
params.n_ctx = value;
}
).set_env("LLAMA_ARG_CTX_SIZE"));
add_opt(llama_arg(
{"-w", "--world", "--world-size"}, "N",
format("number of devices to use (default: %d)", params.n_world),
[](gpt_params & params, int value) {
params.n_world = value;
}
).set_env("LLAMA_ARG_N_WORLD"));
add_opt(llama_arg(
{"-r", "--rank", "--my-rank"}, "N",
format("my rank for distributed inference (default: %d)", params.rank),
[](gpt_params & params, int value) {
params.rank = value;
}
).set_env("LLAMA_ARG_RANK"));
add_opt(llama_arg(
{"-lw", "--layer-window", "--n-layer-window"}, "N",
format("number of layers to process in each compute (default: %d)", params.n_layer_window),
[](gpt_params & params, int value) {
params.n_layer_window = value;
}
).set_env("LLAMA_ARG_N_LAYER_WINDOW"));
add_opt(llama_arg(
{"-mip", "--master", "--master-ip"}, "IPAddress",
format("ip address of the master node (default: %s)", params.master_ip.c_str()),
[](gpt_params & params, const std::string & value) {
params.master_ip = value;
}
).set_env("LLAMA_ARG_MASTER_IP"));
add_opt(llama_arg(
{"-nip", "--next", "--next-node", "--next-ip", "--next-node-ip"}, "IPAddress",
format("ip address of the next node (default: %s)", params.next_node_ip.c_str()),
[](gpt_params & params, const std::string & value) {
params.next_node_ip = value;
}
).set_env("LLAMA_ARG_NEXT_NODE_IP"));
add_opt(llama_arg(
{"-n", "--predict", "--n-predict"}, "N",
format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),

common/common.cpp

@@ -872,6 +872,8 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
return iparams;
}
llama_init_sockets(lctx, cparams.n_world, cparams.rank);
if (!params.control_vectors.empty()) {
if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
@@ -924,28 +926,34 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
if (params.warmup) {
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
const uint32_t my_rank = cparams.rank;
std::vector<llama_token> tmp;
llama_token bos = llama_token_bos(model);
llama_token eos = llama_token_eos(model);
// some models (e.g. T5) don't have a BOS token
if (bos != LLAMA_TOKEN_NULL) {
tmp.push_back(bos);
}
if (eos != LLAMA_TOKEN_NULL) {
tmp.push_back(eos);
}
if (tmp.empty()) {
tmp.push_back(0);
}
if (llama_model_has_encoder(model)) {
llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
if (decoder_start_token_id == -1) {
decoder_start_token_id = bos;
if (my_rank == 0) {
llama_token bos = llama_token_bos(model);
llama_token eos = llama_token_eos(model);
// some models (e.g. T5) don't have a BOS token
if (bos != LLAMA_TOKEN_NULL) {
tmp.push_back(bos);
}
if (eos != LLAMA_TOKEN_NULL) {
tmp.push_back(eos);
}
if (tmp.empty()) {
tmp.push_back(0);
}
if (llama_model_has_encoder(model)) {
throw std::runtime_error("this model is currently not supported");
llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
if (decoder_start_token_id == -1) {
decoder_start_token_id = bos;
}
tmp.clear();
tmp.push_back(decoder_start_token_id);
}
tmp.clear();
tmp.push_back(decoder_start_token_id);
}
if (llama_model_has_decoder(model)) {
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
@@ -976,6 +984,9 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
if (params.n_gpu_layers != -1) {
mparams.n_gpu_layers = params.n_gpu_layers;
}
mparams.n_world = params.n_world;
mparams.rank = params.rank;
mparams.n_layer_window = params.n_layer_window;
mparams.rpc_servers = params.rpc_servers.c_str();
mparams.main_gpu = params.main_gpu;
mparams.split_mode = params.split_mode;
@@ -1025,6 +1036,22 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
auto cparams = llama_context_default_params();
cparams.n_world = params.n_world;
cparams.rank = params.rank;
cparams.n_layer_window = params.n_layer_window;
if (cparams.master_ip != nullptr) {
delete[] cparams.master_ip;
}
cparams.master_ip = new char[params.master_ip.length() + 1];
std::strcpy(cparams.master_ip, params.master_ip.c_str());
if (cparams.next_node_ip != nullptr) {
delete[] cparams.next_node_ip;
}
cparams.next_node_ip = new char[params.next_node_ip.length() + 1];
std::strcpy(cparams.next_node_ip, params.next_node_ip.c_str());
cparams.n_ctx = params.n_ctx;
cparams.n_seq_max = params.n_parallel;
cparams.n_batch = params.n_batch;

common/common.h

@@ -142,6 +142,11 @@ struct gpt_sampler_params {
};
struct gpt_params {
int32_t n_world = 1; // number of devices to use
int32_t rank = 0; // my rank for distributed inference
int32_t n_layer_window = 32; // number of layers to process in each compute
std::string master_ip = "localhost"; // ip address of the master node
std::string next_node_ip = "localhost"; // ip address of my next node
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 0; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
@@ -229,8 +234,7 @@ struct gpt_params {
bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
bool kl_divergence = false; // compute KL divergence
bool kl_divergence = false; // compute KL divergence
bool usage = false; // print usage
bool use_color = false; // use color to distinguish generations and inputs
bool special = false; // enable special token output
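
For illustration only, the new `gpt_params` fields might be filled like this for a two-device pipeline. The addresses, the window size, and the assumption that the last node points back at the master are placeholders, not something this hunk specifies.

```cpp
#include "common.h"   // struct gpt_params with the fields added above

// Sketch: distributed-inference parameters for a hypothetical two-node setup.
static gpt_params make_node_params(int32_t rank) {
    gpt_params p;
    p.n_world        = 2;                 // two devices participate
    p.rank           = rank;              // 0 = master node, 1 = worker node
    p.n_layer_window = 16;                // layers handled per compute step (placeholder)
    p.master_ip      = "192.168.1.10";    // placeholder address of the master
    p.next_node_ip   = (rank == 0)
                         ? "192.168.1.11"  // master forwards to the worker
                         : "192.168.1.10"; // assumed: the last node points back to the master
    return p;
}
```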

examples/main/main.cpp

@@ -14,6 +14,7 @@
#include <sstream>
#include <string>
#include <vector>
#include <thread>
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
@@ -141,6 +142,9 @@ int main(int argc, char ** argv) {
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
return 1;
}
const uint32_t n_world = params.n_world;
const uint32_t my_rank = params.rank;
GGML_ASSERT(!(n_world == 1 && my_rank > 0));
gpt_init();
@@ -151,22 +155,6 @@ int main(int argc, char ** argv) {
console::init(params.simple_io, params.use_color);
atexit([]() { console::cleanup(); });
if (params.logits_all) {
LOG_ERR("************\n");
LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
LOG_ERR("************\n\n");
return 0;
}
if (params.embedding) {
LOG_ERR("************\n");
LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
LOG_ERR("************\n\n");
return 0;
}
if (params.n_ctx != 0 && params.n_ctx < 8) {
LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
@@ -290,7 +278,7 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd_inp;
{
if (my_rank == 0) {
auto prompt = (params.conversation && params.enable_chat_template && !params.prompt.empty())
? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
: params.prompt;
@@ -304,23 +292,23 @@ int main(int argc, char ** argv) {
LOG_DBG("prompt: \"%s\"\n", prompt.c_str());
LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
}
// Should not run without any tokens
if (embd_inp.empty()) {
if (add_bos) {
embd_inp.push_back(llama_token_bos(model));
LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
} else {
LOG_ERR("input is empty\n");
return -1;
// should not run without any tokens
if (embd_inp.empty()) {
if (add_bos) {
embd_inp.push_back(llama_token_bos(model));
LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
} else {
LOG_ERR("input is empty\n");
return -1;
}
}
}
// Tokenize negative prompt
if ((int) embd_inp.size() > n_ctx - 4) {
LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1;
// tokenize negative prompt
if ((int) embd_inp.size() > n_ctx - 4) {
LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1;
}
}
// debug message about similarity of saved session, if applicable
@@ -448,18 +436,18 @@ int main(int argc, char ** argv) {
}
}
smpl = gpt_sampler_init(model, sparams);
if (!smpl) {
LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
return 1;
if (my_rank == 0) {
smpl = gpt_sampler_init(model, sparams);
if (!smpl) {
LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
return 1;
}
LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl));
LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
}
LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl));
LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
// group-attention state
// number of grouped KV tokens so far (used only if params.grp_attn_n > 1)
int ga_i = 0;
@@ -487,9 +475,7 @@ int main(int argc, char ** argv) {
" - If you want to submit another line, end your input with '\\'.\n";
}
LOG_INF("== Running in interactive mode. ==\n");
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
LOG_INF( " - Press Ctrl+C to interject at any time.\n");
#endif
LOG_INF( " - Enter quit or exit to quit chat.\n");
LOG_INF( "%s\n", control_message);
is_interacting = params.interactive_first;
@@ -525,6 +511,8 @@ int main(int argc, char ** argv) {
}
if (llama_model_has_encoder(model)) {
throw std::runtime_error("this model is currently not supported");
int enc_input_size = embd_inp.size();
llama_token * enc_input_buf = embd_inp.data();
@@ -542,9 +530,16 @@ int main(int argc, char ** argv) {
embd_inp.push_back(decoder_start_token_id);
}
char * stop_signal = nullptr;
std::thread signal_thread;
if (my_rank != 0) {
signal_thread = std::thread(llama_free_sockets, ctx, &stop_signal);
}
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
// predict
if (!embd.empty()) {
if (!embd.empty() || my_rank != 0) {
// Note: (n_ctx - 4) here is to match the logic for commandline prompt handling via
// --prompt or --file which uses the same value.
int max_embd_size = n_ctx - 4;
@@ -640,25 +635,22 @@ int main(int argc, char ** argv) {
}
}
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
int n_eval = (int) embd.size() - i;
if (n_eval > params.n_batch) {
n_eval = params.n_batch;
if (my_rank == 0) {
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
int n_eval = (int) embd.size() - i;
if (n_eval > params.n_batch) {
n_eval = params.n_batch;
}
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0)) != 0) {
LOG_ERR("%s : failed to eval\n", __func__);
return 1;
}
n_past += n_eval;
}
LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
LOG_ERR("%s : failed to eval\n", __func__);
return 1;
}
n_past += n_eval;
LOG_DBG("n_past = %d\n", n_past);
// Display total tokens alongside total time
if (params.n_print > 0 && n_past % params.n_print == 0) {
LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
} else {
llama_decode(ctx, llama_batch_get_one(embd.data(), 0, 0, 0));
if (stop_signal != nullptr && std::strcmp(stop_signal, "STOP") == 0) {
break;
}
}
@@ -670,70 +662,70 @@ int main(int argc, char ** argv) {
embd.clear();
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
// optionally save the session on first sample (for faster prompt loading next time)
if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) {
need_to_save_session = false;
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
if (my_rank == 0) {
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
// optionally save the session on first sample (for faster prompt loading next time)
if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) {
need_to_save_session = false;
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
LOG_DBG("saved session to %s\n", path_session.c_str());
}
LOG_DBG("saved session to %s\n", path_session.c_str());
}
const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
gpt_sampler_accept(smpl, id, /* accept_grammar= */ true);
gpt_sampler_accept(smpl, id, /* accept_grammar= */ true);
embd.push_back(id);
// LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
// echo this to console
input_echo = true;
embd.push_back(id);
// decrement remaining sampling budget
--n_remain;
// echo this to console
input_echo = true;
LOG_DBG("n_remain: %d\n", n_remain);
} else {
// some user input remains from prompt or interaction, forward it to processing
while ((int) embd_inp.size() > n_consumed) {
embd.push_back(embd_inp[n_consumed]);
// decrement remaining sampling budget
--n_remain;
// push the prompt in the sampling context in order to apply repetition penalties later
// for the prompt, we don't apply grammar rules
gpt_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);
LOG_DBG("n_remain: %d\n", n_remain);
} else {
// some user input remains from prompt or interaction, forward it to processing
LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
while ((int) embd_inp.size() > n_consumed) {
embd.push_back(embd_inp[n_consumed]);
// push the prompt in the sampling context in order to apply repetition penalties later
// for the prompt, we don't apply grammar rules
gpt_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);
++n_consumed;
if ((int) embd.size() >= params.n_batch) {
break;
++n_consumed;
if ((int) embd.size() >= params.n_batch) {
break;
}
}
}
}
// display text
if (input_echo && display) {
for (auto id : embd) {
const std::string token_str = llama_token_to_piece(ctx, id, params.special);
if (my_rank == 0) {
if (input_echo && display) {
for (auto id : embd) {
const std::string token_str = llama_token_to_piece(ctx, id, params.special);
// Console/Stream Output
LOG("%s", token_str.c_str());
// Console/Stream Output
LOG("%s", token_str.c_str());
// Record Displayed Tokens To Log
// Note: Generated tokens are created one by one hence this check
if (embd.size() > 1) {
// Incoming Requested Tokens
input_tokens.push_back(id);
} else {
// Outgoing Generated Tokens
output_tokens.push_back(id);
output_ss << token_str;
// Record Displayed Tokens To Log
// Note: Generated tokens are created one by one hence this check
if (embd.size() > 1) {
// Incoming Requested Tokens
input_tokens.push_back(id);
} else {
// Outgoing Generated Tokens
output_tokens.push_back(id);
output_ss << token_str;
}
}
}
}
// reset color to default if there is no pending user input
if (input_echo && (int) embd_inp.size() == n_consumed) {
if (my_rank == 0 && input_echo && (int) embd_inp.size() == n_consumed) {
console::set_display(console::reset);
display = true;
}
@@ -782,7 +774,7 @@ int main(int argc, char ** argv) {
}
// deal with end of generation tokens in interactive mode
if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
if (my_rank == 0 && llama_token_is_eog(model, gpt_sampler_last(smpl))) {
LOG_DBG("found an EOG token\n");
if (params.interactive) {
@@ -840,6 +832,10 @@ int main(int argc, char ** argv) {
console::set_display(console::reset);
display = true;
if (buffer == "quit\n" || buffer == "exit\n") {
break;
}
// Add tokens to embd only if the input buffer is non-empty
// Entering a empty line lets the user pass control back
if (buffer.length() > 1) {
@@ -924,19 +920,20 @@ int main(int argc, char ** argv) {
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
}
LOG("\n\n");
gpt_perf_print(ctx, smpl);
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
gpt_sampler_free(smpl);
if (my_rank == 0) {
LOG("\n\n");
gpt_perf_print(ctx, smpl);
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
gpt_sampler_free(smpl);
llama_free_sockets(ctx, &stop_signal);
}
if (my_rank != 0 && signal_thread.joinable()) {
signal_thread.join();
}
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
ggml_threadpool_free(threadpool);
ggml_threadpool_free(threadpool_batch);
return 0;
}
}
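
Condensed, the rank split these hunks introduce into the decode step looks roughly as follows. The helper below is a restatement of the logic above (names mirror `main.cpp`), not additional behaviour.

```cpp
#include "llama.h"
#include <algorithm>
#include <cstring>
#include <vector>

// Sketch: one decode step of the main loop after this change.
// Returns false when generation should stop (eval error or "STOP" received).
static bool decode_step(llama_context * ctx, std::vector<llama_token> & embd,
                        int & n_past, int n_batch, uint32_t my_rank,
                        const char * stop_signal) {
    if (my_rank == 0) {
        // master: evaluate the pending tokens in batches, as before
        for (int i = 0; i < (int) embd.size(); i += n_batch) {
            const int n_eval = std::min((int) embd.size() - i, n_batch);
            if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0)) != 0) {
                return false;   // failed to eval
            }
            n_past += n_eval;
        }
    } else {
        // worker: join the decode with an empty batch and watch for the stop signal
        llama_decode(ctx, llama_batch_get_one(embd.data(), 0, 0, 0));
        if (stop_signal != nullptr && std::strcmp(stop_signal, "STOP") == 0) {
            return false;       // the master asked this node to shut down
        }
    }
    return true;
}
```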

include/llama.h

@@ -276,6 +276,9 @@ extern "C" {
};
struct llama_model_params {
uint32_t n_world; // number of nodes
uint32_t rank; // my node rank
uint32_t n_layer_window; // number of layers to keep each time
int32_t n_gpu_layers; // number of layers to store in VRAM
enum llama_split_mode split_mode; // how to split the model across multiple GPUs
@@ -312,12 +315,17 @@ extern "C" {
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
// https://github.com/ggerganov/llama.cpp/pull/7544
struct llama_context_params {
uint32_t n_ctx; // text context, 0 = from model
uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
uint32_t n_ubatch; // physical maximum batch size
uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
int32_t n_threads; // number of threads to use for generation
int32_t n_threads_batch; // number of threads to use for batch processing
uint32_t n_world; // world size
uint32_t rank; // my rank
uint32_t n_layer_window; // number of layers to process in each compute
char * master_ip; // ip address of the master node
char * next_node_ip; // ip address of the next node
uint32_t n_ctx; // text context, 0 = from model
uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
uint32_t n_ubatch; // physical maximum batch size
uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
int32_t n_threads; // number of threads to use for generation
int32_t n_threads_batch; // number of threads to use for batch processing
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
@@ -418,6 +426,9 @@ extern "C" {
LLAMA_API void llama_free_model(struct llama_model * model);
LLAMA_API void llama_init_sockets(struct llama_context * ctx, uint32_t n_world, uint32_t my_rank);
LLAMA_API void llama_free_sockets(struct llama_context * ctx, char ** msg);
// TODO: rename to llama_init_from_model
LLAMA_API struct llama_context * llama_new_context_with_model(
struct llama_model * model,
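
Taken together with how `common/common.cpp` and `examples/main/main.cpp` use these declarations elsewhere in this commit, the intended lifecycle appears to be roughly the sketch below. Only the signatures above are guaranteed by the header; the ownership handling mirrors `common.cpp` in this commit, and everything else is inferred.

```cpp
#include "llama.h"
#include <cstring>

// Sketch: set up and tear down the node-to-node links implied by the new API.
static llama_context * start_node(llama_model * model, uint32_t n_world, uint32_t rank,
                                  const char * master_ip, const char * next_ip) {
    llama_context_params cparams = llama_context_default_params();
    cparams.n_world        = n_world;
    cparams.rank           = rank;
    cparams.n_layer_window = 16;                              // placeholder window size
    // mirror common.cpp: replace any defaults with heap-allocated copies
    if (cparams.master_ip    != nullptr) delete[] cparams.master_ip;
    if (cparams.next_node_ip != nullptr) delete[] cparams.next_node_ip;
    cparams.master_ip    = new char[std::strlen(master_ip) + 1];
    std::strcpy(cparams.master_ip, master_ip);
    cparams.next_node_ip = new char[std::strlen(next_ip) + 1];
    std::strcpy(cparams.next_node_ip, next_ip);

    llama_context * ctx = llama_new_context_with_model(model, cparams);
    llama_init_sockets(ctx, n_world, rank);                   // open the sockets for this node
    return ctx;
}

static void stop_node(llama_context * ctx) {
    char * stop_signal = nullptr;
    llama_free_sockets(ctx, &stop_signal);                    // rank 0 appears to broadcast "STOP" here
    llama_free(ctx);
}
```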

include/zmq.hpp (new file, 2796 lines)

File diff suppressed because it is too large.

include/zmq_addon.hpp (new file, 848 lines)

@@ -0,0 +1,848 @@
/*
Copyright (c) 2016-2017 ZeroMQ community
Copyright (c) 2016 VOCA AS / Harald Nøkland
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
IN THE SOFTWARE.
*/
#ifndef __ZMQ_ADDON_HPP_INCLUDED__
#define __ZMQ_ADDON_HPP_INCLUDED__
#include "zmq.hpp"
#include <deque>
#include <iomanip>
#include <sstream>
#include <stdexcept>
#ifdef ZMQ_CPP11
#include <limits>
#include <functional>
#include <unordered_map>
namespace zmq
{
// socket ref or native file descriptor for poller
class poller_ref_t
{
public:
enum RefType
{
RT_SOCKET,
RT_FD
};
poller_ref_t() : poller_ref_t(socket_ref{})
{}
poller_ref_t(const zmq::socket_ref& socket) : data{RT_SOCKET, socket, {}}
{}
poller_ref_t(zmq::fd_t fd) : data{RT_FD, {}, fd}
{}
size_t hash() const ZMQ_NOTHROW
{
std::size_t h = 0;
hash_combine(h, std::get<0>(data));
hash_combine(h, std::get<1>(data));
hash_combine(h, std::get<2>(data));
return h;
}
bool operator == (const poller_ref_t& o) const ZMQ_NOTHROW
{
return data == o.data;
}
private:
template <class T>
static void hash_combine(std::size_t& seed, const T& v) ZMQ_NOTHROW
{
std::hash<T> hasher;
seed ^= hasher(v) + 0x9e3779b9 + (seed<<6) + (seed>>2);
}
std::tuple<int, zmq::socket_ref, zmq::fd_t> data;
}; // class poller_ref_t
} // namespace zmq
// std::hash<> specialization for std::unordered_map
template <> struct std::hash<zmq::poller_ref_t>
{
size_t operator()(const zmq::poller_ref_t& ref) const ZMQ_NOTHROW
{
return ref.hash();
}
};
#endif // ZMQ_CPP11
namespace zmq
{
#ifdef ZMQ_CPP11
namespace detail
{
template<bool CheckN, class OutputIt>
recv_result_t
recv_multipart_n(socket_ref s, OutputIt out, size_t n, recv_flags flags)
{
size_t msg_count = 0;
message_t msg;
while (true) {
if ZMQ_CONSTEXPR_IF (CheckN) {
if (msg_count >= n)
throw std::runtime_error(
"Too many message parts in recv_multipart_n");
}
if (!s.recv(msg, flags)) {
// zmq ensures atomic delivery of messages
assert(msg_count == 0);
return {};
}
++msg_count;
const bool more = msg.more();
*out++ = std::move(msg);
if (!more)
break;
}
return msg_count;
}
inline bool is_little_endian()
{
const uint16_t i = 0x01;
return *reinterpret_cast<const uint8_t *>(&i) == 0x01;
}
inline void write_network_order(unsigned char *buf, const uint32_t value)
{
if (is_little_endian()) {
ZMQ_CONSTEXPR_VAR uint32_t mask = (std::numeric_limits<std::uint8_t>::max)();
*buf++ = static_cast<unsigned char>((value >> 24) & mask);
*buf++ = static_cast<unsigned char>((value >> 16) & mask);
*buf++ = static_cast<unsigned char>((value >> 8) & mask);
*buf++ = static_cast<unsigned char>(value & mask);
} else {
std::memcpy(buf, &value, sizeof(value));
}
}
inline uint32_t read_u32_network_order(const unsigned char *buf)
{
if (is_little_endian()) {
return (static_cast<uint32_t>(buf[0]) << 24)
+ (static_cast<uint32_t>(buf[1]) << 16)
+ (static_cast<uint32_t>(buf[2]) << 8)
+ static_cast<uint32_t>(buf[3]);
} else {
uint32_t value;
std::memcpy(&value, buf, sizeof(value));
return value;
}
}
} // namespace detail
/* Receive a multipart message.
Writes the zmq::message_t objects to OutputIterator out.
The out iterator must handle an unspecified number of writes,
e.g. by using std::back_inserter.
Returns: the number of messages received or nullopt (on EAGAIN).
Throws: if recv throws. Any exceptions thrown
by the out iterator will be propagated and the message
may have been only partially received with pending
message parts. It is adviced to close this socket in that event.
*/
template<class OutputIt>
ZMQ_NODISCARD recv_result_t recv_multipart(socket_ref s,
OutputIt out,
recv_flags flags = recv_flags::none)
{
return detail::recv_multipart_n<false>(s, std::move(out), 0, flags);
}
/* Receive a multipart message.
Writes at most n zmq::message_t objects to OutputIterator out.
If the number of message parts of the incoming message exceeds n
then an exception will be thrown.
Returns: the number of messages received or nullopt (on EAGAIN).
Throws: if recv throws. Throws std::runtime_error if the number
of message parts exceeds n (exactly n messages will have been written
to out). Any exceptions thrown
by the out iterator will be propagated and the message
may have been only partially received with pending
message parts. It is adviced to close this socket in that event.
*/
template<class OutputIt>
ZMQ_NODISCARD recv_result_t recv_multipart_n(socket_ref s,
OutputIt out,
size_t n,
recv_flags flags = recv_flags::none)
{
return detail::recv_multipart_n<true>(s, std::move(out), n, flags);
}
/* Send a multipart message.
The range must be a ForwardRange of zmq::message_t,
zmq::const_buffer or zmq::mutable_buffer.
The flags may be zmq::send_flags::sndmore if there are
more message parts to be sent after the call to this function.
Returns: the number of messages sent (exactly msgs.size()) or nullopt (on EAGAIN).
Throws: if send throws. Any exceptions thrown
by the msgs range will be propagated and the message
may have been only partially sent. It is adviced to close this socket in that event.
*/
template<class Range
#ifndef ZMQ_CPP11_PARTIAL
,
typename = typename std::enable_if<
detail::is_range<Range>::value
&& (std::is_same<detail::range_value_t<Range>, message_t>::value
|| detail::is_buffer<detail::range_value_t<Range>>::value)>::type
#endif
>
send_result_t
send_multipart(socket_ref s, Range &&msgs, send_flags flags = send_flags::none)
{
using std::begin;
using std::end;
auto it = begin(msgs);
const auto end_it = end(msgs);
size_t msg_count = 0;
while (it != end_it) {
const auto next = std::next(it);
const auto msg_flags =
flags | (next == end_it ? send_flags::none : send_flags::sndmore);
if (!s.send(*it, msg_flags)) {
// zmq ensures atomic delivery of messages
assert(it == begin(msgs));
return {};
}
++msg_count;
it = next;
}
return msg_count;
}
/* Encode a multipart message.
The range must be a ForwardRange of zmq::message_t. A
zmq::multipart_t or STL container may be passed for encoding.
Returns: a zmq::message_t holding the encoded multipart data.
Throws: std::range_error is thrown if the size of any single part
can not fit in an unsigned 32 bit integer.
The encoding is compatible with that used by the CZMQ function
zmsg_encode(), see https://rfc.zeromq.org/spec/50/.
Each part consists of a size followed by the data.
These are placed contiguously into the output message. A part of
size less than 255 bytes will have a single byte size value.
Larger parts will have a five byte size value with the first byte
set to 0xFF and the remaining four bytes holding the size of the
part's data.
*/
template<class Range
#ifndef ZMQ_CPP11_PARTIAL
,
typename = typename std::enable_if<
detail::is_range<Range>::value
&& (std::is_same<detail::range_value_t<Range>, message_t>::value
|| detail::is_buffer<detail::range_value_t<Range>>::value)>::type
#endif
>
message_t encode(const Range &parts)
{
size_t mmsg_size = 0;
// First pass check sizes
for (const auto &part : parts) {
const size_t part_size = part.size();
if (part_size > (std::numeric_limits<std::uint32_t>::max)()) {
// Size value must fit into uint32_t.
throw std::range_error("Invalid size, message part too large");
}
const size_t count_size =
part_size < (std::numeric_limits<std::uint8_t>::max)() ? 1 : 5;
mmsg_size += part_size + count_size;
}
message_t encoded(mmsg_size);
unsigned char *buf = encoded.data<unsigned char>();
for (const auto &part : parts) {
const uint32_t part_size = static_cast<uint32_t>(part.size());
const unsigned char *part_data =
static_cast<const unsigned char *>(part.data());
if (part_size < (std::numeric_limits<std::uint8_t>::max)()) {
// small part
*buf++ = (unsigned char) part_size;
} else {
// big part
*buf++ = (std::numeric_limits<uint8_t>::max)();
detail::write_network_order(buf, part_size);
buf += sizeof(part_size);
}
std::memcpy(buf, part_data, part_size);
buf += part_size;
}
assert(static_cast<size_t>(buf - encoded.data<unsigned char>()) == mmsg_size);
return encoded;
}
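/* Worked example (illustrative): encoding the parts "hi" (2 bytes) and
   "data" (4 bytes) yields one 8-byte message laid out as

     0x02 'h' 'i'  0x04 'd' 'a' 't' 'a'

   i.e. a single-byte size prefix per part, because both parts are smaller
   than 255 bytes. A 300-byte part would instead be prefixed with
   0xFF 0x00 0x00 0x01 0x2C (the 0xFF marker followed by the 32-bit size in
   network byte order).

     std::vector<zmq::message_t> parts;
     parts.emplace_back("hi", 2);
     parts.emplace_back("data", 4);
     zmq::message_t bundle = zmq::encode(parts);  // bundle.size() == 8
*/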
/* Decode an encoded message to multiple parts.
The given output iterator must be a ForwardIterator to a container
holding zmq::message_t such as a zmq::multipart_t or various STL
containers.
Returns the ForwardIterator advanced once past the last decoded
part.
   Throws: std::out_of_range if the encoded part sizes
   would exceed the message data bounds.
The decoding assumes the message is encoded in the manner
performed by zmq::encode(), see https://rfc.zeromq.org/spec/50/.
*/
template<class OutputIt> OutputIt decode(const message_t &encoded, OutputIt out)
{
const unsigned char *source = encoded.data<unsigned char>();
const unsigned char *const limit = source + encoded.size();
while (source < limit) {
size_t part_size = *source++;
if (part_size == (std::numeric_limits<std::uint8_t>::max)()) {
if (static_cast<size_t>(limit - source) < sizeof(uint32_t)) {
throw std::out_of_range(
"Malformed encoding, overflow in reading size");
}
part_size = detail::read_u32_network_order(source);
            // accept a five-byte header even if the size it carries is below 0xFF
source += sizeof(uint32_t);
}
if (static_cast<size_t>(limit - source) < part_size) {
throw std::out_of_range("Malformed encoding, overflow in reading part");
}
*out = message_t(source, part_size);
++out;
source += part_size;
}
assert(source == limit);
return out;
}
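/* Usage sketch (illustrative): reverse the worked example after encode()
   above, turning the single `bundle` message back into its parts, e.g. on
   the receiving side of a connection that carried the bundle as one frame.

     std::vector<zmq::message_t> decoded;
     zmq::decode(bundle, std::back_inserter(decoded));
     // decoded[0] holds "hi", decoded[1] holds "data"
*/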
#endif
#ifdef ZMQ_HAS_RVALUE_REFS
/*
This class handles multipart messaging. It is the C++ equivalent of zmsg.h,
which is part of CZMQ (the high-level C binding). Furthermore, it is a major
improvement compared to zmsg.hpp, which is part of the examples in the ØMQ
Guide. Unnecessary copying is avoided by using move semantics to efficiently
    add/remove parts. A usage sketch follows the class definition below.
*/
class multipart_t
{
private:
std::deque<message_t> m_parts;
public:
typedef std::deque<message_t>::value_type value_type;
typedef std::deque<message_t>::iterator iterator;
typedef std::deque<message_t>::const_iterator const_iterator;
typedef std::deque<message_t>::reverse_iterator reverse_iterator;
typedef std::deque<message_t>::const_reverse_iterator const_reverse_iterator;
// Default constructor
multipart_t() {}
// Construct from socket receive
multipart_t(socket_ref socket) { recv(socket); }
// Construct from memory block
multipart_t(const void *src, size_t size) { addmem(src, size); }
// Construct from string
multipart_t(const std::string &string) { addstr(string); }
// Construct from message part
multipart_t(message_t &&message) { add(std::move(message)); }
// Move constructor
multipart_t(multipart_t &&other) ZMQ_NOTHROW { m_parts = std::move(other.m_parts); }
// Move assignment operator
multipart_t &operator=(multipart_t &&other) ZMQ_NOTHROW
{
m_parts = std::move(other.m_parts);
return *this;
}
// Destructor
virtual ~multipart_t() { clear(); }
message_t &operator[](size_t n) { return m_parts[n]; }
const message_t &operator[](size_t n) const { return m_parts[n]; }
message_t &at(size_t n) { return m_parts.at(n); }
const message_t &at(size_t n) const { return m_parts.at(n); }
iterator begin() { return m_parts.begin(); }
const_iterator begin() const { return m_parts.begin(); }
const_iterator cbegin() const { return m_parts.cbegin(); }
reverse_iterator rbegin() { return m_parts.rbegin(); }
const_reverse_iterator rbegin() const { return m_parts.rbegin(); }
iterator end() { return m_parts.end(); }
const_iterator end() const { return m_parts.end(); }
const_iterator cend() const { return m_parts.cend(); }
reverse_iterator rend() { return m_parts.rend(); }
const_reverse_iterator rend() const { return m_parts.rend(); }
// Delete all parts
void clear() { m_parts.clear(); }
// Get number of parts
size_t size() const { return m_parts.size(); }
// Check if number of parts is zero
bool empty() const { return m_parts.empty(); }
// Receive multipart message from socket
bool recv(socket_ref socket, int flags = 0)
{
clear();
bool more = true;
while (more) {
message_t message;
#ifdef ZMQ_CPP11
if (!socket.recv(message, static_cast<recv_flags>(flags)))
return false;
#else
if (!socket.recv(&message, flags))
return false;
#endif
more = message.more();
add(std::move(message));
}
return true;
}
// Send multipart message to socket
bool send(socket_ref socket, int flags = 0)
{
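        // Strip any caller-supplied SNDMORE; the more flag is re-applied per part below.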
flags &= ~(ZMQ_SNDMORE);
bool more = size() > 0;
while (more) {
message_t message = pop();
more = size() > 0;
#ifdef ZMQ_CPP11
if (!socket.send(message, static_cast<send_flags>(
(more ? ZMQ_SNDMORE : 0) | flags)))
return false;
#else
if (!socket.send(message, (more ? ZMQ_SNDMORE : 0) | flags))
return false;
#endif
}
clear();
return true;
}
// Concatenate other multipart to front
void prepend(multipart_t &&other)
{
while (!other.empty())
push(other.remove());
}
// Concatenate other multipart to back
void append(multipart_t &&other)
{
while (!other.empty())
add(other.pop());
}
// Push memory block to front
void pushmem(const void *src, size_t size)
{
m_parts.push_front(message_t(src, size));
}
// Push memory block to back
void addmem(const void *src, size_t size)
{
m_parts.push_back(message_t(src, size));
}
// Push string to front
void pushstr(const std::string &string)
{
m_parts.push_front(message_t(string.data(), string.size()));
}
// Push string to back
void addstr(const std::string &string)
{
m_parts.push_back(message_t(string.data(), string.size()));
}
// Push type (fixed-size) to front
template<typename T> void pushtyp(const T &type)
{
static_assert(!std::is_same<T, std::string>::value,
"Use pushstr() instead of pushtyp<std::string>()");
m_parts.push_front(message_t(&type, sizeof(type)));
}
// Push type (fixed-size) to back
template<typename T> void addtyp(const T &type)
{
static_assert(!std::is_same<T, std::string>::value,
"Use addstr() instead of addtyp<std::string>()");
m_parts.push_back(message_t(&type, sizeof(type)));
}
// Push message part to front
void push(message_t &&message) { m_parts.push_front(std::move(message)); }
// Push message part to back
void add(message_t &&message) { m_parts.push_back(std::move(message)); }
// Alias to allow std::back_inserter()
void push_back(message_t &&message) { m_parts.push_back(std::move(message)); }
// Pop string from front
std::string popstr()
{
std::string string(m_parts.front().data<char>(), m_parts.front().size());
m_parts.pop_front();
return string;
}
// Pop type (fixed-size) from front
template<typename T> T poptyp()
{
static_assert(!std::is_same<T, std::string>::value,
"Use popstr() instead of poptyp<std::string>()");
if (sizeof(T) != m_parts.front().size())
throw std::runtime_error(
"Invalid type, size does not match the message size");
T type = *m_parts.front().data<T>();
m_parts.pop_front();
return type;
}
// Pop message part from front
message_t pop()
{
message_t message = std::move(m_parts.front());
m_parts.pop_front();
return message;
}
// Pop message part from back
message_t remove()
{
message_t message = std::move(m_parts.back());
m_parts.pop_back();
return message;
}
// get message part from front
const message_t &front() { return m_parts.front(); }
// get message part from back
const message_t &back() { return m_parts.back(); }
// Get pointer to a specific message part
const message_t *peek(size_t index) const { return &m_parts[index]; }
// Get a string copy of a specific message part
std::string peekstr(size_t index) const
{
std::string string(m_parts[index].data<char>(), m_parts[index].size());
return string;
}
    // Peek type (fixed-size) at a given index
template<typename T> T peektyp(size_t index) const
{
static_assert(!std::is_same<T, std::string>::value,
"Use peekstr() instead of peektyp<std::string>()");
if (sizeof(T) != m_parts[index].size())
throw std::runtime_error(
"Invalid type, size does not match the message size");
T type = *m_parts[index].data<T>();
return type;
}
// Create multipart from type (fixed-size)
template<typename T> static multipart_t create(const T &type)
{
multipart_t multipart;
multipart.addtyp(type);
return multipart;
}
// Copy multipart
multipart_t clone() const
{
multipart_t multipart;
for (size_t i = 0; i < size(); i++)
multipart.addmem(m_parts[i].data(), m_parts[i].size());
return multipart;
}
// Dump content to string
std::string str() const
{
std::stringstream ss;
for (size_t i = 0; i < m_parts.size(); i++) {
const unsigned char *data = m_parts[i].data<unsigned char>();
size_t size = m_parts[i].size();
// Dump the message as text or binary
bool isText = true;
for (size_t j = 0; j < size; j++) {
if (data[j] < 32 || data[j] > 127) {
isText = false;
break;
}
}
ss << "\n[" << std::dec << std::setw(3) << std::setfill('0') << size
<< "] ";
if (size >= 1000) {
ss << "... (too big to print)";
continue;
}
for (size_t j = 0; j < size; j++) {
if (isText)
ss << static_cast<char>(data[j]);
else
ss << std::hex << std::setw(2) << std::setfill('0')
<< static_cast<short>(data[j]);
}
}
return ss.str();
}
// Check if equal to other multipart
bool equal(const multipart_t *other) const ZMQ_NOTHROW
{
return *this == *other;
}
bool operator==(const multipart_t &other) const ZMQ_NOTHROW
{
if (size() != other.size())
return false;
for (size_t i = 0; i < size(); i++)
if (at(i) != other.at(i))
return false;
return true;
}
bool operator!=(const multipart_t &other) const ZMQ_NOTHROW
{
return !(*this == other);
}
#ifdef ZMQ_CPP11
// Return single part message_t encoded from this multipart_t.
message_t encode() const { return zmq::encode(*this); }
// Decode encoded message into multiple parts and append to self.
void decode_append(const message_t &encoded)
{
zmq::decode(encoded, std::back_inserter(*this));
}
// Return a new multipart_t containing the decoded message_t.
static multipart_t decode(const message_t &encoded)
{
multipart_t tmp;
zmq::decode(encoded, std::back_inserter(tmp));
return tmp;
}
#endif
private:
// Disable implicit copying (moving is more efficient)
multipart_t(const multipart_t &other) ZMQ_DELETED_FUNCTION;
void operator=(const multipart_t &other) ZMQ_DELETED_FUNCTION;
}; // class multipart_t
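/* Usage sketch for multipart_t (illustrative only, not part of the upstream
   header), assuming connected zmq::socket_t sockets `req` and `rep` and that
   <cstdint> is included.

     zmq::multipart_t request;
     request.addstr("resize");
     std::uint32_t width = 640;
     request.addtyp(width);                // fixed-size binary part
     request.send(req);

     zmq::multipart_t incoming(rep);       // this constructor receives all parts
     std::string cmd = incoming.popstr();  // "resize"
     auto w = incoming.poptyp<std::uint32_t>();  // 640
*/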
inline std::ostream &operator<<(std::ostream &os, const multipart_t &msg)
{
return os << msg.str();
}
#endif // ZMQ_HAS_RVALUE_REFS
#if defined(ZMQ_BUILD_DRAFT_API) && defined(ZMQ_CPP11) && defined(ZMQ_HAVE_POLLER)
class active_poller_t
{
public:
active_poller_t() = default;
~active_poller_t() = default;
active_poller_t(const active_poller_t &) = delete;
active_poller_t &operator=(const active_poller_t &) = delete;
active_poller_t(active_poller_t &&src) = default;
active_poller_t &operator=(active_poller_t &&src) = default;
using handler_type = std::function<void(event_flags)>;
void add(zmq::socket_ref socket, event_flags events, handler_type handler)
{
const poller_ref_t ref{socket};
if (!handler)
throw std::invalid_argument("null handler in active_poller_t::add (socket)");
auto ret = handlers.emplace(
ref, std::make_shared<handler_type>(std::move(handler)));
if (!ret.second)
throw error_t(EINVAL); // already added
try {
base_poller.add(socket, events, ret.first->second.get());
need_rebuild = true;
}
catch (...) {
// rollback
handlers.erase(ref);
throw;
}
}
void add(fd_t fd, event_flags events, handler_type handler)
{
const poller_ref_t ref{fd};
if (!handler)
throw std::invalid_argument("null handler in active_poller_t::add (fd)");
auto ret = handlers.emplace(
ref, std::make_shared<handler_type>(std::move(handler)));
if (!ret.second)
throw error_t(EINVAL); // already added
try {
base_poller.add(fd, events, ret.first->second.get());
need_rebuild = true;
}
catch (...) {
// rollback
handlers.erase(ref);
throw;
}
}
void remove(zmq::socket_ref socket)
{
base_poller.remove(socket);
handlers.erase(socket);
need_rebuild = true;
}
void remove(fd_t fd)
{
base_poller.remove(fd);
handlers.erase(fd);
need_rebuild = true;
}
void modify(zmq::socket_ref socket, event_flags events)
{
base_poller.modify(socket, events);
}
void modify(fd_t fd, event_flags events)
{
base_poller.modify(fd, events);
}
size_t wait(std::chrono::milliseconds timeout)
{
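        // Lazily rebuild after add()/remove(): size the reusable event buffer and
        // take shared_ptr copies of the handlers so they stay alive even if a
        // callback erases its own entry while events are being dispatched.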
if (need_rebuild) {
poller_events.resize(handlers.size());
poller_handlers.clear();
poller_handlers.reserve(handlers.size());
for (const auto &handler : handlers) {
poller_handlers.push_back(handler.second);
}
need_rebuild = false;
}
const auto count = base_poller.wait_all(poller_events, timeout);
std::for_each(poller_events.begin(),
poller_events.begin() + static_cast<ptrdiff_t>(count),
[](decltype(base_poller)::event_type &event) {
assert(event.user_data != nullptr);
(*event.user_data)(event.events);
});
return count;
}
ZMQ_NODISCARD bool empty() const noexcept { return handlers.empty(); }
size_t size() const noexcept { return handlers.size(); }
private:
bool need_rebuild{false};
poller_t<handler_type> base_poller{};
std::unordered_map<zmq::poller_ref_t, std::shared_ptr<handler_type>> handlers{};
std::vector<decltype(base_poller)::event_type> poller_events{};
std::vector<std::shared_ptr<handler_type>> poller_handlers{};
}; // class active_poller_t
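/* Usage sketch (illustrative only, draft API): dispatch readable events to a
   callback, assuming a connected zmq::socket_t named `sock`.

     zmq::active_poller_t poller;
     poller.add(sock, zmq::event_flags::pollin, [&sock](zmq::event_flags) {
         zmq::message_t msg;
         if (sock.recv(msg, zmq::recv_flags::dontwait)) {
             // handle msg ...
         }
     });
     // Blocks for up to 100 ms and returns the number of handlers invoked.
     const size_t handled = poller.wait(std::chrono::milliseconds{100});
*/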
#endif // defined(ZMQ_BUILD_DRAFT_API) && defined(ZMQ_CPP11) && defined(ZMQ_HAVE_POLLER)
} // namespace zmq
#endif // __ZMQ_ADDON_HPP_INCLUDED__
