Lizonghang 2024-10-23 09:42:32 +04:00
parent 6374743747
commit 2a01ff5fb1
10 changed files with 4725 additions and 1026 deletions

Makefile

@@ -262,6 +262,14 @@ MK_CFLAGS = -std=c11 -fPIC
MK_CXXFLAGS = -std=c++11 -fPIC
MK_NVCCFLAGS = -std=c++11
ifeq ($(UNAME_S),Darwin)
MK_CPPFLAGS += -I/opt/homebrew/include
MK_LDFLAGS += -L/opt/homebrew/lib -lzmq
else ifeq ($(UNAME_S),Linux)
MK_CPPFLAGS += -I/usr/local/include
MK_LDFLAGS += -L/usr/local/lib -lzmq
endif
ifdef LLAMA_NO_CCACHE
GGML_NO_CCACHE := 1
DEPRECATE_WARNING := 1
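
These Makefile additions link against ZeroMQ (`-lzmq`), for which this commit also vendors the C++ headers (`include/zmq.hpp`, `include/zmq_addon.hpp`). Below is a minimal sketch to confirm that libzmq is installed and links with the include/lib paths above; the file name and the choice of a PAIR socket are illustrative only, not part of the commit.

```cpp
// zmq_link_check.cpp -- sanity check for the new libzmq dependency (sketch).
// Build (Linux):  g++ -std=c++11 zmq_link_check.cpp -I/usr/local/include -L/usr/local/lib -lzmq
// Build (macOS):  g++ -std=c++11 zmq_link_check.cpp -I/opt/homebrew/include -L/opt/homebrew/lib -lzmq
#include <cstdio>
#include <zmq.hpp>

int main() {
    zmq::context_t ctx(1);                             // one I/O thread
    zmq::socket_t  sock(ctx, zmq::socket_type::pair);  // any socket type works for a link check
    sock.bind("tcp://127.0.0.1:5555");
    std::printf("libzmq %d.%d.%d linked successfully\n",
                ZMQ_VERSION_MAJOR, ZMQ_VERSION_MINOR, ZMQ_VERSION_PATCH);
    return 0;
}
```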

README.md

@@ -1,479 +1,3 @@
# llama.cpp
# prima.cpp
![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
[![Conan Center](https://shields.io/conan/v/llama-cpp)](https://conan.io/center/llama-cpp)
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
## Recent API changes
- [Changelog for `libllama` API](https://github.com/ggerganov/llama.cpp/issues/9289)
- [Changelog for `llama-server` REST API](https://github.com/ggerganov/llama.cpp/issues/9291)
## Hot topics
- **Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669**
- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
----
## Description
The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
variety of hardware - locally and in the cloud.
- Plain C/C++ implementation without any dependencies
- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
- AVX, AVX2 and AVX512 support for x86 architectures
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
- Vulkan and SYCL backend support
- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has
improved significantly thanks to many contributions. It is the main playground for developing new features for the
[ggml](https://github.com/ggerganov/ggml) library.
**Supported models:**
Typically finetunes of the base models below are supported as well.
- [X] LLaMA 🦙
- [x] LLaMA 2 🦙🦙
- [x] LLaMA 3 🦙🦙🦙
- [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
- [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct)
- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
- [X] [BERT](https://github.com/ggerganov/llama.cpp/pull/5423)
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
- [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
- [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
- [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
- [X] [StableLM models](https://huggingface.co/stabilityai)
- [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek)
- [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
- [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557)
- [x] [Phi models](https://huggingface.co/models?search=microsoft/phi)
- [x] [GPT-2](https://huggingface.co/gpt2)
- [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118)
- [x] [InternLM2](https://huggingface.co/models?search=internlm2)
- [x] [CodeShell](https://github.com/WisdomShell/codeshell)
- [x] [Gemma](https://ai.google.dev/gemma)
- [x] [Mamba](https://github.com/state-spaces/mamba)
- [x] [Grok-1](https://huggingface.co/keyfan/grok-1-hf)
- [x] [Xverse](https://huggingface.co/models?search=xverse)
- [x] [Command-R models](https://huggingface.co/models?search=CohereForAI/c4ai-command-r)
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
- [x] [OLMo](https://allenai.org/olmo)
- [x] [OLMoE](https://huggingface.co/allenai/OLMoE-1B-7B-0924)
- [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330)
- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
- [x] [Snowflake-Arctic MoE](https://huggingface.co/collections/Snowflake/arctic-66290090abe542894a5ac520)
- [x] [Smaug](https://huggingface.co/models?search=Smaug)
- [x] [Poro 34B](https://huggingface.co/LumiOpen/Poro-34B)
- [x] [Bitnet b1.58 models](https://huggingface.co/1bitLLM)
- [x] [Flan T5](https://huggingface.co/models?search=flan-t5)
- [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca)
- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)
- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
- [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
**Multimodal models:**
- [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [LLaVA 1.6 models](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2)
- [x] [BakLLaVA](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
- [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5)
- [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
**Bindings:**
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
- JS/TS (Programmable Prompt Engine CLI): [offline-ai/cli](https://github.com/offline-ai/cli)
- JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
- Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama)
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
- Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs)
- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
- Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
**UI:**
Unless otherwise noted these projects are open-source with permissive licensing:
- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
- [iohub/collama](https://github.com/iohub/coLLaMA)
- [janhq/jan](https://github.com/janhq/jan) (AGPL)
- [nat/openplayground](https://github.com/nat/openplayground)
- [Faraday](https://faraday.dev/) (proprietary)
- [LMStudio](https://lmstudio.ai/) (proprietary)
- [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
- [ramalama](https://github.com/containers/ramalama) (MIT)
- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all)
- [ollama/ollama](https://github.com/ollama/ollama)
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
- [RAGNA Desktop](https://ragna.app/) (proprietary)
- [RecurseChat](https://recurse.chat/) (proprietary)
- [semperai/amica](https://github.com/semperai/amica)
- [withcatai/catai](https://github.com/withcatai/catai)
- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
- [Msty](https://msty.app) (proprietary)
- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file)(Apachev2.0 or later)
- [Dot](https://github.com/alexpinel/Dot) (GPL)
- [MindMac](https://mindmac.app) (proprietary)
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
- [AIKit](https://github.com/sozercan/aikit) (MIT)
- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
**Tools:**
- [akx/ggify](https://github.com/akx/ggify) download PyTorch models from HuggingFace Hub and convert them to GGML
- [akx/ollama-dl](https://github.com/akx/ollama-dl) download models from the Ollama library to be used directly with llama.cpp
- [crashr/gppm](https://github.com/crashr/gppm) launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example)
**Infrastructure:**
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
**Games:**
- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
## Demo
<details>
<summary>Typical run using LLaMA v2 13B on M2 Ultra</summary>
```
$ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
I llama.cpp build info:
I UNAME_S: Darwin
I UNAME_P: arm
I UNAME_M: arm64
I CFLAGS: -I. -O3 -std=c11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -pthread -DGGML_USE_K_QUANTS -DGGML_USE_ACCELERATE
I CXXFLAGS: -I. -I./common -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS
I LDFLAGS: -framework Accelerate
I CC: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
make: Nothing to be done for `default'.
main: build = 1041 (cf658ad)
main: seed = 1692823051
llama_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from models/llama-13b-v2/ggml-model-q4_0.gguf (version GGUF V1 (latest))
llama_model_loader: - type f32: 81 tensors
llama_model_loader: - type q4_0: 281 tensors
llama_model_loader: - type q6_K: 1 tensors
llm_load_print_meta: format = GGUF V1 (latest)
llm_load_print_meta: arch = llama
llm_load_print_meta: vocab type = SPM
llm_load_print_meta: n_vocab = 32000
llm_load_print_meta: n_merges = 0
llm_load_print_meta: n_ctx_train = 4096
llm_load_print_meta: n_ctx = 512
llm_load_print_meta: n_embd = 5120
llm_load_print_meta: n_head = 40
llm_load_print_meta: n_head_kv = 40
llm_load_print_meta: n_layer = 40
llm_load_print_meta: n_rot = 128
llm_load_print_meta: n_gqa = 1
llm_load_print_meta: f_norm_eps = 1.0e-05
llm_load_print_meta: f_norm_rms_eps = 1.0e-05
llm_load_print_meta: n_ff = 13824
llm_load_print_meta: freq_base = 10000.0
llm_load_print_meta: freq_scale = 1
llm_load_print_meta: model type = 13B
llm_load_print_meta: model ftype = mostly Q4_0
llm_load_print_meta: model size = 13.02 B
llm_load_print_meta: general.name = LLaMA v2
llm_load_print_meta: BOS token = 1 '<s>'
llm_load_print_meta: EOS token = 2 '</s>'
llm_load_print_meta: UNK token = 0 '<unk>'
llm_load_print_meta: LF token = 13 '<0x0A>'
llm_load_tensors: ggml ctx size = 0.11 MB
llm_load_tensors: mem required = 7024.01 MB (+ 400.00 MB per state)
...................................................................................................
llama_new_context_with_model: kv self size = 400.00 MB
llama_new_context_with_model: compute buffer total size = 75.41 MB
system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000
generate: n_ctx = 512, n_batch = 512, n_predict = 400, n_keep = 0
Building a website can be done in 10 simple steps:
Step 1: Find the right website platform.
Step 2: Choose your domain name and hosting plan.
Step 3: Design your website layout.
Step 4: Write your website content and add images.
Step 5: Install security features to protect your site from hackers or spammers
Step 6: Test your website on multiple browsers, mobile devices, operating systems etc…
Step 7: Test it again with people who are not related to you personally friends or family members will work just fine!
Step 8: Start marketing and promoting the website via social media channels or paid ads
Step 9: Analyze how many visitors have come to your site so far, what type of people visit more often than others (e.g., men vs women) etc…
Step 10: Continue to improve upon all aspects mentioned above by following trends in web design and staying up-to-date on new technologies that can enhance user experience even further!
How does a Website Work?
A website works by having pages, which are made of HTML code. This code tells your computer how to display the content on each page you visit whether its an image or text file (like PDFs). In order for someone elses browser not only be able but also want those same results when accessing any given URL; some additional steps need taken by way of programming scripts that will add functionality such as making links clickable!
The most common type is called static HTML pages because they remain unchanged over time unless modified manually (either through editing files directly or using an interface such as WordPress). They are usually served up via HTTP protocols this means anyone can access them without having any special privileges like being part of a group who is allowed into restricted areas online; however, there may still exist some limitations depending upon where one lives geographically speaking.
How to
llama_print_timings: load time = 576.45 ms
llama_print_timings: sample time = 283.10 ms / 400 runs ( 0.71 ms per token, 1412.91 tokens per second)
llama_print_timings: prompt eval time = 599.83 ms / 19 tokens ( 31.57 ms per token, 31.68 tokens per second)
llama_print_timings: eval time = 24513.59 ms / 399 runs ( 61.44 ms per token, 16.28 tokens per second)
llama_print_timings: total time = 25431.49 ms
```
</details>
<details>
<summary>Demo of running both LLaMA-7B and whisper.cpp on a single M1 Pro MacBook</summary>
And here is another demo of running both LLaMA-7B and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) on a single M1 Pro MacBook:
https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8b4f-add84093ffff.mp4
</details>
## Usage
Here are the end-to-end binary build and model conversion steps for most supported models.
### Basic usage
Firstly, you need to get the binary. There are different methods that you can follow:
- Method 1: Clone this repository and build locally, see [how to build](./docs/build.md)
- Method 2: If you are using MacOS or Linux, you can install llama.cpp via [brew, flox or nix](./docs/install.md)
- Method 3: Use a Docker image, see [documentation for Docker](./docs/docker.md)
- Method 4: Download pre-built binary from [releases](https://github.com/ggerganov/llama.cpp/releases)
You can run a basic completion using this command:
```bash
llama-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128
# Output:
# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
```
See [this page](./examples/main/README.md) for a full list of parameters.
### Conversation mode
If you want a more ChatGPT-like experience, you can run in conversation mode by passing `-cnv` as a parameter:
```bash
llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv
# Output:
# > hi, who are you?
# Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today?
#
# > what is 1+1?
# Easy peasy! The answer to 1+1 is... 2!
```
By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. See the list of [supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
```bash
./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml
```
You can also use your own template via in-prefix, in-suffix and reverse-prompt parameters:
```bash
./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
```
### Web server
[llama.cpp web server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
Example usage:
```bash
./llama-server -m your_model.gguf --port 8080
# Basic web UI can be accessed via browser: http://localhost:8080
# Chat completion endpoint: http://localhost:8080/v1/chat/completions
```
### Interactive mode
> [!NOTE]
> If you prefer basic usage, please consider using conversation mode instead of interactive mode
In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMA emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
Here is an example of a few-shot interaction, invoked with the command
```bash
# default arguments using a 7B model
./examples/chat.sh
# advanced chat with a 13B model
./examples/chat-13B.sh
# custom arguments using a 13B model
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
```
Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `llama-cli` example program.
![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png)
### Persistent Interaction
The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
```bash
# Start a new chat
PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
# Resume that chat
PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
# Start a different chat with the same prompt/model
PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/another ./examples/chat-persistent.sh
# Different prompt cache for different prompt/model
PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
CHAT_SAVE_DIR=./chat/bob ./examples/chat-persistent.sh
```
### Constrained output with grammars
`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:
```bash
./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
```
The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
## Build
Please refer to [Build llama.cpp locally](./docs/build.md)
## Supported backends
| Backend | Target devices |
| --- | --- |
| [Metal](./docs/build.md#metal-build) | Apple Silicon |
| [BLAS](./docs/build.md#blas-build) | All |
| [BLIS](./docs/backend/BLIS.md) | All |
| [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU |
| [MUSA](./docs/build.md#musa) | Moore Threads GPU |
| [CUDA](./docs/build.md#cuda) | Nvidia GPU |
| [hipBLAS](./docs/build.md#hipblas) | AMD GPU |
| [Vulkan](./docs/build.md#vulkan) | GPU |
| [CANN](./docs/build.md#cann) | Ascend NPU |
## Tools
### Prepare and Quantize
> [!NOTE]
> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours.
To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
Note: `convert.py` has been moved to `examples/convert_legacy_llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
It does not support LLaMA 3, you can use `convert_hf_to_gguf.py` with LLaMA 3 downloaded from Hugging Face.
To learn more about quantizing model, [read this documentation](./examples/quantize/README.md)
### Perplexity (measuring model quality)
You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better).
For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity).
To learn more how to measure perplexity using llama.cpp, [read this documentation](./examples/perplexity/README.md)
## Contributing
- Contributors can open PRs
- Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
- Collaborators will be invited based on contributions
- Any help with managing issues, PRs and projects is very appreciated!
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
- Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information
- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
- A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)
## Other documentations
- [main (cli)](./examples/main/README.md)
- [server](./examples/server/README.md)
- [jeopardy](./examples/jeopardy/README.md)
- [GBNF grammars](./grammars/README.md)
**Development documentations**
- [How to build](./docs/build.md)
- [Running on Docker](./docs/docker.md)
- [Build on Android](./docs/android.md)
- [Performance troubleshooting](./docs/development/token_generation_performance_tips.md)
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
**Seminal papers and background on the models**
If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
- LLaMA:
- [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/)
- [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)
- GPT-3
- [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165)
- GPT-3.5 / InstructGPT / ChatGPT:
- [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
- [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
This is a distributed implementation of [llama.cpp](https://github.com/ggerganov/llama.cpp), coming soon.

common/arg.cpp

@@ -661,6 +661,41 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
params.n_ctx = value;
}
).set_env("LLAMA_ARG_CTX_SIZE"));
add_opt(llama_arg(
{"-w", "--world", "--world-size"}, "N",
format("number of devices to use (default: %d)", params.n_world),
[](gpt_params & params, int value) {
params.n_world = value;
}
).set_env("LLAMA_ARG_N_WORLD"));
add_opt(llama_arg(
{"-r", "--rank", "--my-rank"}, "N",
format("my rank for distributed inference (default: %d)", params.rank),
[](gpt_params & params, int value) {
params.rank = value;
}
).set_env("LLAMA_ARG_RANK"));
add_opt(llama_arg(
{"-lw", "--layer-window", "--n-layer-window"}, "N",
format("number of layers to process in each compute (default: %d)", params.n_layer_window),
[](gpt_params & params, int value) {
params.n_layer_window = value;
}
).set_env("LLAMA_ARG_N_LAYER_WINDOW"));
add_opt(llama_arg(
{"-mip", "--master", "--master-ip"}, "IPAddress",
format("ip address of the master node (default: %s)", params.master_ip.c_str()),
[](gpt_params & params, const std::string & value) {
params.master_ip = value;
}
).set_env("LLAMA_ARG_MASTER_IP"));
add_opt(llama_arg(
{"-nip", "--next", "--next-node", "--next-ip", "--next-node-ip"}, "IPAddress",
format("ip address of the next node (default: %s)", params.next_node_ip.c_str()),
[](gpt_params & params, const std::string & value) {
params.next_node_ip = value;
}
).set_env("LLAMA_ARG_NEXT_NODE_IP"));
add_opt(llama_arg(
{"-n", "--predict", "--n-predict"}, "N",
format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),

common/common.cpp

@@ -872,6 +872,8 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
return iparams;
}
llama_init_sockets(lctx, cparams.n_world, cparams.rank);
if (!params.control_vectors.empty()) {
if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
@@ -924,28 +926,34 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
if (params.warmup) {
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
const uint32_t my_rank = cparams.rank;
std::vector<llama_token> tmp;
llama_token bos = llama_token_bos(model);
llama_token eos = llama_token_eos(model);
// some models (e.g. T5) don't have a BOS token
if (bos != LLAMA_TOKEN_NULL) {
tmp.push_back(bos);
}
if (eos != LLAMA_TOKEN_NULL) {
tmp.push_back(eos);
}
if (tmp.empty()) {
tmp.push_back(0);
}
if (llama_model_has_encoder(model)) {
llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
if (decoder_start_token_id == -1) {
decoder_start_token_id = bos;
if (my_rank == 0) {
llama_token bos = llama_token_bos(model);
llama_token eos = llama_token_eos(model);
// some models (e.g. T5) don't have a BOS token
if (bos != LLAMA_TOKEN_NULL) {
tmp.push_back(bos);
}
if (eos != LLAMA_TOKEN_NULL) {
tmp.push_back(eos);
}
if (tmp.empty()) {
tmp.push_back(0);
}
if (llama_model_has_encoder(model)) {
throw std::runtime_error("this model is currently not supported");
llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
if (decoder_start_token_id == -1) {
decoder_start_token_id = bos;
}
tmp.clear();
tmp.push_back(decoder_start_token_id);
}
tmp.clear();
tmp.push_back(decoder_start_token_id);
}
if (llama_model_has_decoder(model)) {
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
@@ -976,6 +984,9 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
if (params.n_gpu_layers != -1) {
mparams.n_gpu_layers = params.n_gpu_layers;
}
mparams.n_world = params.n_world;
mparams.rank = params.rank;
mparams.n_layer_window = params.n_layer_window;
mparams.rpc_servers = params.rpc_servers.c_str();
mparams.main_gpu = params.main_gpu;
mparams.split_mode = params.split_mode;
@@ -1025,6 +1036,22 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
auto cparams = llama_context_default_params();
cparams.n_world = params.n_world;
cparams.rank = params.rank;
cparams.n_layer_window = params.n_layer_window;
if (cparams.master_ip != nullptr) {
delete[] cparams.master_ip;
}
cparams.master_ip = new char[params.master_ip.length() + 1];
std::strcpy(cparams.master_ip, params.master_ip.c_str());
if (cparams.next_node_ip != nullptr) {
delete[] cparams.next_node_ip;
}
cparams.next_node_ip = new char[params.next_node_ip.length() + 1];
std::strcpy(cparams.next_node_ip, params.next_node_ip.c_str());
cparams.n_ctx = params.n_ctx;
cparams.n_seq_max = params.n_parallel;
cparams.n_batch = params.n_batch;

common/common.h

@@ -142,6 +142,11 @@ struct gpt_sampler_params {
};
struct gpt_params {
int32_t n_world = 1; // number of devices to use
int32_t rank = 0; // my rank for distributed inference
int32_t n_layer_window = 32; // number of layers to process in each compute
std::string master_ip = "localhost"; // ip address of the master node
std::string next_node_ip = "localhost"; // ip address of my next node
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 0; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
@@ -229,8 +234,7 @@ struct gpt_params {
bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
bool kl_divergence = false; // compute KL divergence
bool kl_divergence = false; // compute KL divergence
bool usage = false; // print usage
bool use_color = false; // use color to distinguish generations and inputs
bool special = false; // enable special token output
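
For illustration only, the new `gpt_params` fields might be filled like this for a two-device pipeline. The addresses, the window size, and the assumption that the last node points back at the master are placeholders, not something this hunk specifies.

```cpp
#include "common.h"   // struct gpt_params with the fields added above

// Sketch: distributed-inference parameters for a hypothetical two-node setup.
static gpt_params make_node_params(int32_t rank) {
    gpt_params p;
    p.n_world        = 2;                 // two devices participate
    p.rank           = rank;              // 0 = master node, 1 = worker node
    p.n_layer_window = 16;                // layers handled per compute step (placeholder)
    p.master_ip      = "192.168.1.10";    // placeholder address of the master
    p.next_node_ip   = (rank == 0)
                         ? "192.168.1.11"  // master forwards to the worker
                         : "192.168.1.10"; // assumed: the last node points back to the master
    return p;
}
```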

examples/main/main.cpp

@@ -14,6 +14,7 @@
#include <sstream>
#include <string>
#include <vector>
#include <thread>
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
@@ -141,6 +142,9 @@ int main(int argc, char ** argv) {
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
return 1;
}
const uint32_t n_world = params.n_world;
const uint32_t my_rank = params.rank;
GGML_ASSERT(!(n_world == 1 && my_rank > 0));
gpt_init();
@@ -151,22 +155,6 @@ int main(int argc, char ** argv) {
console::init(params.simple_io, params.use_color);
atexit([]() { console::cleanup(); });
if (params.logits_all) {
LOG_ERR("************\n");
LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
LOG_ERR("************\n\n");
return 0;
}
if (params.embedding) {
LOG_ERR("************\n");
LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
LOG_ERR("************\n\n");
return 0;
}
if (params.n_ctx != 0 && params.n_ctx < 8) {
LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
@@ -290,7 +278,7 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd_inp;
{
if (my_rank == 0) {
auto prompt = (params.conversation && params.enable_chat_template && !params.prompt.empty())
? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
: params.prompt;
@@ -304,23 +292,23 @@ int main(int argc, char ** argv) {
LOG_DBG("prompt: \"%s\"\n", prompt.c_str());
LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
}
// Should not run without any tokens
if (embd_inp.empty()) {
if (add_bos) {
embd_inp.push_back(llama_token_bos(model));
LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
} else {
LOG_ERR("input is empty\n");
return -1;
// should not run without any tokens
if (embd_inp.empty()) {
if (add_bos) {
embd_inp.push_back(llama_token_bos(model));
LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
} else {
LOG_ERR("input is empty\n");
return -1;
}
}
}
// Tokenize negative prompt
if ((int) embd_inp.size() > n_ctx - 4) {
LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1;
// tokenize negative prompt
if ((int) embd_inp.size() > n_ctx - 4) {
LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1;
}
}
// debug message about similarity of saved session, if applicable
@@ -448,18 +436,18 @@ int main(int argc, char ** argv) {
}
}
smpl = gpt_sampler_init(model, sparams);
if (!smpl) {
LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
return 1;
if (my_rank == 0) {
smpl = gpt_sampler_init(model, sparams);
if (!smpl) {
LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
return 1;
}
LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl));
LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
}
LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl));
LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
// group-attention state
// number of grouped KV tokens so far (used only if params.grp_attn_n > 1)
int ga_i = 0;
@@ -487,9 +475,7 @@ int main(int argc, char ** argv) {
" - If you want to submit another line, end your input with '\\'.\n";
}
LOG_INF("== Running in interactive mode. ==\n");
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
LOG_INF( " - Press Ctrl+C to interject at any time.\n");
#endif
LOG_INF( " - Enter quit or exit to quit chat.\n");
LOG_INF( "%s\n", control_message);
is_interacting = params.interactive_first;
@@ -525,6 +511,8 @@ int main(int argc, char ** argv) {
}
if (llama_model_has_encoder(model)) {
throw std::runtime_error("this model is currently not supported");
int enc_input_size = embd_inp.size();
llama_token * enc_input_buf = embd_inp.data();
@@ -542,9 +530,16 @@ int main(int argc, char ** argv) {
embd_inp.push_back(decoder_start_token_id);
}
char * stop_signal = nullptr;
std::thread signal_thread;
if (my_rank != 0) {
signal_thread = std::thread(llama_free_sockets, ctx, &stop_signal);
}
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
// predict
if (!embd.empty()) {
if (!embd.empty() || my_rank != 0) {
// Note: (n_ctx - 4) here is to match the logic for commandline prompt handling via
// --prompt or --file which uses the same value.
int max_embd_size = n_ctx - 4;
@@ -640,25 +635,22 @@ int main(int argc, char ** argv) {
}
}
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
int n_eval = (int) embd.size() - i;
if (n_eval > params.n_batch) {
n_eval = params.n_batch;
if (my_rank == 0) {
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
int n_eval = (int) embd.size() - i;
if (n_eval > params.n_batch) {
n_eval = params.n_batch;
}
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0)) != 0) {
LOG_ERR("%s : failed to eval\n", __func__);
return 1;
}
n_past += n_eval;
}
LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
LOG_ERR("%s : failed to eval\n", __func__);
return 1;
}
n_past += n_eval;
LOG_DBG("n_past = %d\n", n_past);
// Display total tokens alongside total time
if (params.n_print > 0 && n_past % params.n_print == 0) {
LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
} else {
llama_decode(ctx, llama_batch_get_one(embd.data(), 0, 0, 0));
if (stop_signal != nullptr && std::strcmp(stop_signal, "STOP") == 0) {
break;
}
}
@@ -670,70 +662,70 @@ int main(int argc, char ** argv) {
embd.clear();
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
// optionally save the session on first sample (for faster prompt loading next time)
if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) {
need_to_save_session = false;
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
if (my_rank == 0) {
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
// optionally save the session on first sample (for faster prompt loading next time)
if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) {
need_to_save_session = false;
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
LOG_DBG("saved session to %s\n", path_session.c_str());
}
LOG_DBG("saved session to %s\n", path_session.c_str());
}
const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
gpt_sampler_accept(smpl, id, /* accept_grammar= */ true);
gpt_sampler_accept(smpl, id, /* accept_grammar= */ true);
embd.push_back(id);
// LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
// echo this to console
input_echo = true;
embd.push_back(id);
// decrement remaining sampling budget
--n_remain;
// echo this to console
input_echo = true;
LOG_DBG("n_remain: %d\n", n_remain);
} else {
// some user input remains from prompt or interaction, forward it to processing
while ((int) embd_inp.size() > n_consumed) {
embd.push_back(embd_inp[n_consumed]);
// decrement remaining sampling budget
--n_remain;
// push the prompt in the sampling context in order to apply repetition penalties later
// for the prompt, we don't apply grammar rules
gpt_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);
LOG_DBG("n_remain: %d\n", n_remain);
} else {
// some user input remains from prompt or interaction, forward it to processing
LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
while ((int) embd_inp.size() > n_consumed) {
embd.push_back(embd_inp[n_consumed]);
// push the prompt in the sampling context in order to apply repetition penalties later
// for the prompt, we don't apply grammar rules
gpt_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);
++n_consumed;
if ((int) embd.size() >= params.n_batch) {
break;
++n_consumed;
if ((int) embd.size() >= params.n_batch) {
break;
}
}
}
}
// display text
if (input_echo && display) {
for (auto id : embd) {
const std::string token_str = llama_token_to_piece(ctx, id, params.special);
if (my_rank == 0) {
if (input_echo && display) {
for (auto id : embd) {
const std::string token_str = llama_token_to_piece(ctx, id, params.special);
// Console/Stream Output
LOG("%s", token_str.c_str());
// Console/Stream Output
LOG("%s", token_str.c_str());
// Record Displayed Tokens To Log
// Note: Generated tokens are created one by one hence this check
if (embd.size() > 1) {
// Incoming Requested Tokens
input_tokens.push_back(id);
} else {
// Outgoing Generated Tokens
output_tokens.push_back(id);
output_ss << token_str;
// Record Displayed Tokens To Log
// Note: Generated tokens are created one by one hence this check
if (embd.size() > 1) {
// Incoming Requested Tokens
input_tokens.push_back(id);
} else {
// Outgoing Generated Tokens
output_tokens.push_back(id);
output_ss << token_str;
}
}
}
}
// reset color to default if there is no pending user input
if (input_echo && (int) embd_inp.size() == n_consumed) {
if (my_rank == 0 && input_echo && (int) embd_inp.size() == n_consumed) {
console::set_display(console::reset);
display = true;
}
@@ -782,7 +774,7 @@ int main(int argc, char ** argv) {
}
// deal with end of generation tokens in interactive mode
if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
if (my_rank == 0 && llama_token_is_eog(model, gpt_sampler_last(smpl))) {
LOG_DBG("found an EOG token\n");
if (params.interactive) {
@@ -840,6 +832,10 @@ int main(int argc, char ** argv) {
console::set_display(console::reset);
display = true;
if (buffer == "quit\n" || buffer == "exit\n") {
break;
}
// Add tokens to embd only if the input buffer is non-empty
// Entering a empty line lets the user pass control back
if (buffer.length() > 1) {
@@ -924,19 +920,20 @@ int main(int argc, char ** argv) {
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
}
LOG("\n\n");
gpt_perf_print(ctx, smpl);
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
gpt_sampler_free(smpl);
if (my_rank == 0) {
LOG("\n\n");
gpt_perf_print(ctx, smpl);
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
gpt_sampler_free(smpl);
llama_free_sockets(ctx, &stop_signal);
}
if (my_rank != 0 && signal_thread.joinable()) {
signal_thread.join();
}
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
ggml_threadpool_free(threadpool);
ggml_threadpool_free(threadpool_batch);
return 0;
}
}
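
Condensed, the rank split these hunks introduce into the decode step looks roughly as follows. The helper below is a restatement of the logic above (names mirror `main.cpp`), not additional behaviour.

```cpp
#include "llama.h"
#include <algorithm>
#include <cstring>
#include <vector>

// Sketch: one decode step of the main loop after this change.
// Returns false when generation should stop (eval error or "STOP" received).
static bool decode_step(llama_context * ctx, std::vector<llama_token> & embd,
                        int & n_past, int n_batch, uint32_t my_rank,
                        const char * stop_signal) {
    if (my_rank == 0) {
        // master: evaluate the pending tokens in batches, as before
        for (int i = 0; i < (int) embd.size(); i += n_batch) {
            const int n_eval = std::min((int) embd.size() - i, n_batch);
            if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0)) != 0) {
                return false;   // failed to eval
            }
            n_past += n_eval;
        }
    } else {
        // worker: join the decode with an empty batch and watch for the stop signal
        llama_decode(ctx, llama_batch_get_one(embd.data(), 0, 0, 0));
        if (stop_signal != nullptr && std::strcmp(stop_signal, "STOP") == 0) {
            return false;       // the master asked this node to shut down
        }
    }
    return true;
}
```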

include/llama.h

@@ -276,6 +276,9 @@ extern "C" {
};
struct llama_model_params {
uint32_t n_world; // number of nodes
uint32_t rank; // my node rank
uint32_t n_layer_window; // number of layers to keep each time
int32_t n_gpu_layers; // number of layers to store in VRAM
enum llama_split_mode split_mode; // how to split the model across multiple GPUs
@@ -312,12 +315,17 @@ extern "C" {
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
// https://github.com/ggerganov/llama.cpp/pull/7544
struct llama_context_params {
uint32_t n_ctx; // text context, 0 = from model
uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
uint32_t n_ubatch; // physical maximum batch size
uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
int32_t n_threads; // number of threads to use for generation
int32_t n_threads_batch; // number of threads to use for batch processing
uint32_t n_world; // world size
uint32_t rank; // my rank
uint32_t n_layer_window; // number of layers to process in each compute
char * master_ip; // ip address of the master node
char * next_node_ip; // ip address of the next node
uint32_t n_ctx; // text context, 0 = from model
uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
uint32_t n_ubatch; // physical maximum batch size
uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
int32_t n_threads; // number of threads to use for generation
int32_t n_threads_batch; // number of threads to use for batch processing
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
@@ -418,6 +426,9 @@ extern "C" {
LLAMA_API void llama_free_model(struct llama_model * model);
LLAMA_API void llama_init_sockets(struct llama_context * ctx, uint32_t n_world, uint32_t my_rank);
LLAMA_API void llama_free_sockets(struct llama_context * ctx, char ** msg);
// TODO: rename to llama_init_from_model
LLAMA_API struct llama_context * llama_new_context_with_model(
struct llama_model * model,
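
Taken together with how `common/common.cpp` and `examples/main/main.cpp` use these declarations elsewhere in this commit, the intended lifecycle appears to be roughly the sketch below. Only the signatures above are guaranteed by the header; the ownership handling mirrors `common.cpp` in this commit, and everything else is inferred.

```cpp
#include "llama.h"
#include <cstring>

// Sketch: set up and tear down the node-to-node links implied by the new API.
static llama_context * start_node(llama_model * model, uint32_t n_world, uint32_t rank,
                                  const char * master_ip, const char * next_ip) {
    llama_context_params cparams = llama_context_default_params();
    cparams.n_world        = n_world;
    cparams.rank           = rank;
    cparams.n_layer_window = 16;                              // placeholder window size
    // mirror common.cpp: replace any defaults with heap-allocated copies
    if (cparams.master_ip    != nullptr) delete[] cparams.master_ip;
    if (cparams.next_node_ip != nullptr) delete[] cparams.next_node_ip;
    cparams.master_ip    = new char[std::strlen(master_ip) + 1];
    std::strcpy(cparams.master_ip, master_ip);
    cparams.next_node_ip = new char[std::strlen(next_ip) + 1];
    std::strcpy(cparams.next_node_ip, next_ip);

    llama_context * ctx = llama_new_context_with_model(model, cparams);
    llama_init_sockets(ctx, n_world, rank);                   // open the sockets for this node
    return ctx;
}

static void stop_node(llama_context * ctx) {
    char * stop_signal = nullptr;
    llama_free_sockets(ctx, &stop_signal);                    // rank 0 appears to broadcast "STOP" here
    llama_free(ctx);
}
```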

include/zmq.hpp (new file, 2796 lines)

File diff suppressed because it is too large.

include/zmq_addon.hpp (new file, 848 lines)

@@ -0,0 +1,848 @@
/*
Copyright (c) 2016-2017 ZeroMQ community
Copyright (c) 2016 VOCA AS / Harald Nøkland
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
IN THE SOFTWARE.
*/
#ifndef __ZMQ_ADDON_HPP_INCLUDED__
#define __ZMQ_ADDON_HPP_INCLUDED__
#include "zmq.hpp"
#include <deque>
#include <iomanip>
#include <sstream>
#include <stdexcept>
#ifdef ZMQ_CPP11
#include <limits>
#include <functional>
#include <unordered_map>
namespace zmq
{
// socket ref or native file descriptor for poller
class poller_ref_t
{
public:
enum RefType
{
RT_SOCKET,
RT_FD
};
poller_ref_t() : poller_ref_t(socket_ref{})
{}
poller_ref_t(const zmq::socket_ref& socket) : data{RT_SOCKET, socket, {}}
{}
poller_ref_t(zmq::fd_t fd) : data{RT_FD, {}, fd}
{}
size_t hash() const ZMQ_NOTHROW
{
std::size_t h = 0;
hash_combine(h, std::get<0>(data));
hash_combine(h, std::get<1>(data));
hash_combine(h, std::get<2>(data));
return h;
}
bool operator == (const poller_ref_t& o) const ZMQ_NOTHROW
{
return data == o.data;
}
private:
template <class T>
static void hash_combine(std::size_t& seed, const T& v) ZMQ_NOTHROW
{
std::hash<T> hasher;
seed ^= hasher(v) + 0x9e3779b9 + (seed<<6) + (seed>>2);
}
std::tuple<int, zmq::socket_ref, zmq::fd_t> data;
}; // class poller_ref_t
} // namespace zmq
// std::hash<> specialization for std::unordered_map
template <> struct std::hash<zmq::poller_ref_t>
{
size_t operator()(const zmq::poller_ref_t& ref) const ZMQ_NOTHROW
{
return ref.hash();
}
};
#endif // ZMQ_CPP11
namespace zmq
{
#ifdef ZMQ_CPP11
namespace detail
{
template<bool CheckN, class OutputIt>
recv_result_t
recv_multipart_n(socket_ref s, OutputIt out, size_t n, recv_flags flags)
{
size_t msg_count = 0;
message_t msg;
while (true) {
if ZMQ_CONSTEXPR_IF (CheckN) {
if (msg_count >= n)
throw std::runtime_error(
"Too many message parts in recv_multipart_n");
}
if (!s.recv(msg, flags)) {
// zmq ensures atomic delivery of messages
assert(msg_count == 0);
return {};
}
++msg_count;
const bool more = msg.more();
*out++ = std::move(msg);
if (!more)
break;
}
return msg_count;
}
inline bool is_little_endian()
{
const uint16_t i = 0x01;
return *reinterpret_cast<const uint8_t *>(&i) == 0x01;
}
inline void write_network_order(unsigned char *buf, const uint32_t value)
{
if (is_little_endian()) {
ZMQ_CONSTEXPR_VAR uint32_t mask = (std::numeric_limits<std::uint8_t>::max)();
*buf++ = static_cast<unsigned char>((value >> 24) & mask);
*buf++ = static_cast<unsigned char>((value >> 16) & mask);
*buf++ = static_cast<unsigned char>((value >> 8) & mask);
*buf++ = static_cast<unsigned char>(value & mask);
} else {
std::memcpy(buf, &value, sizeof(value));
}
}
inline uint32_t read_u32_network_order(const unsigned char *buf)
{
if (is_little_endian()) {
return (static_cast<uint32_t>(buf[0]) << 24)
+ (static_cast<uint32_t>(buf[1]) << 16)
+ (static_cast<uint32_t>(buf[2]) << 8)
+ static_cast<uint32_t>(buf[3]);
} else {
uint32_t value;
std::memcpy(&value, buf, sizeof(value));
return value;
}
}
} // namespace detail
/* Receive a multipart message.
Writes the zmq::message_t objects to OutputIterator out.
The out iterator must handle an unspecified number of writes,
e.g. by using std::back_inserter.
Returns: the number of messages received or nullopt (on EAGAIN).
Throws: if recv throws. Any exceptions thrown
by the out iterator will be propagated and the message
may have been only partially received with pending
message parts. It is adviced to close this socket in that event.
*/
template<class OutputIt>
ZMQ_NODISCARD recv_result_t recv_multipart(socket_ref s,
OutputIt out,
recv_flags flags = recv_flags::none)
{
return detail::recv_multipart_n<false>(s, std::move(out), 0, flags);
}
/* Receive a multipart message.
Writes at most n zmq::message_t objects to OutputIterator out.
If the number of message parts of the incoming message exceeds n
then an exception will be thrown.
Returns: the number of messages received or nullopt (on EAGAIN).
Throws: if recv throws. Throws std::runtime_error if the number
of message parts exceeds n (exactly n messages will have been written
to out). Any exceptions thrown
by the out iterator will be propagated and the message
may have been only partially received with pending
message parts. It is adviced to close this socket in that event.
*/
template<class OutputIt>
ZMQ_NODISCARD recv_result_t recv_multipart_n(socket_ref s,
OutputIt out,
size_t n,
recv_flags flags = recv_flags::none)
{
return detail::recv_multipart_n<true>(s, std::move(out), n, flags);
}
/* Send a multipart message.
The range must be a ForwardRange of zmq::message_t,
zmq::const_buffer or zmq::mutable_buffer.
The flags may be zmq::send_flags::sndmore if there are
more message parts to be sent after the call to this function.
Returns: the number of messages sent (exactly msgs.size()) or nullopt (on EAGAIN).
Throws: if send throws. Any exceptions thrown
by the msgs range will be propagated and the message
may have been only partially sent. It is adviced to close this socket in that event.
*/
template<class Range
#ifndef ZMQ_CPP11_PARTIAL
,
typename = typename std::enable_if<
detail::is_range<Range>::value
&& (std::is_same<detail::range_value_t<Range>, message_t>::value
|| detail::is_buffer<detail::range_value_t<Range>>::value)>::type
#endif
>
send_result_t
send_multipart(socket_ref s, Range &&msgs, send_flags flags = send_flags::none)
{
using std::begin;
using std::end;
auto it = begin(msgs);
const auto end_it = end(msgs);
size_t msg_count = 0;
while (it != end_it) {
const auto next = std::next(it);
const auto msg_flags =
flags | (next == end_it ? send_flags::none : send_flags::sndmore);
if (!s.send(*it, msg_flags)) {
// zmq ensures atomic delivery of messages
assert(it == begin(msgs));
return {};
}
++msg_count;
it = next;
}
return msg_count;
}
/* Encode a multipart message.
The range must be a ForwardRange of zmq::message_t. A
zmq::multipart_t or STL container may be passed for encoding.
Returns: a zmq::message_t holding the encoded multipart data.
Throws: std::range_error is thrown if the size of any single part
can not fit in an unsigned 32 bit integer.
The encoding is compatible with that used by the CZMQ function
zmsg_encode(), see https://rfc.zeromq.org/spec/50/.
Each part consists of a size followed by the data.
These are placed contiguously into the output message. A part of
size less than 255 bytes will have a single byte size value.
Larger parts will have a five byte size value with the first byte
set to 0xFF and the remaining four bytes holding the size of the
part's data.
*/
template<class Range
#ifndef ZMQ_CPP11_PARTIAL
,
typename = typename std::enable_if<
detail::is_range<Range>::value
&& (std::is_same<detail::range_value_t<Range>, message_t>::value
|| detail::is_buffer<detail::range_value_t<Range>>::value)>::type
#endif
>
message_t encode(const Range &parts)
{
size_t mmsg_size = 0;
// First pass check sizes
for (const auto &part : parts) {
const size_t part_size = part.size();
if (part_size > (std::numeric_limits<std::uint32_t>::max)()) {
// Size value must fit into uint32_t.
throw std::range_error("Invalid size, message part too large");
}
const size_t count_size =
part_size < (std::numeric_limits<std::uint8_t>::max)() ? 1 : 5;
mmsg_size += part_size + count_size;
}
message_t encoded(mmsg_size);
unsigned char *buf = encoded.data<unsigned char>();
for (const auto &part : parts) {
const uint32_t part_size = static_cast<uint32_t>(part.size());
const unsigned char *part_data =
static_cast<const unsigned char *>(part.data());
if (part_size < (std::numeric_limits<std::uint8_t>::max)()) {
// small part
*buf++ = (unsigned char) part_size;
} else {
// big part
*buf++ = (std::numeric_limits<uint8_t>::max)();
detail::write_network_order(buf, part_size);
buf += sizeof(part_size);
}
std::memcpy(buf, part_data, part_size);
buf += part_size;
}
assert(static_cast<size_t>(buf - encoded.data<unsigned char>()) == mmsg_size);
return encoded;
}
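/* Worked example (illustrative): encoding the parts "hi" (2 bytes) and
   "data" (4 bytes) yields one 8-byte message laid out as

     0x02 'h' 'i'  0x04 'd' 'a' 't' 'a'

   i.e. a single-byte size prefix per part, because both parts are smaller
   than 255 bytes. A 300-byte part would instead be prefixed with
   0xFF 0x00 0x00 0x01 0x2C (the 0xFF marker followed by the 32-bit size in
   network byte order).

     std::vector<zmq::message_t> parts;
     parts.emplace_back("hi", 2);
     parts.emplace_back("data", 4);
     zmq::message_t bundle = zmq::encode(parts);  // bundle.size() == 8
*/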
/* Decode an encoded message to multiple parts.
The given output iterator must be a ForwardIterator to a container
holding zmq::message_t such as a zmq::multipart_t or various STL
containers.
Returns the ForwardIterator advanced once past the last decoded
part.
   Throws: std::out_of_range if the encoded part sizes
   would exceed the message data bounds.
The decoding assumes the message is encoded in the manner
performed by zmq::encode(), see https://rfc.zeromq.org/spec/50/.
*/
template<class OutputIt> OutputIt decode(const message_t &encoded, OutputIt out)
{
const unsigned char *source = encoded.data<unsigned char>();
const unsigned char *const limit = source + encoded.size();
while (source < limit) {
size_t part_size = *source++;
if (part_size == (std::numeric_limits<std::uint8_t>::max)()) {
if (static_cast<size_t>(limit - source) < sizeof(uint32_t)) {
throw std::out_of_range(
"Malformed encoding, overflow in reading size");
}
part_size = detail::read_u32_network_order(source);
            // accept a five-byte header even if the size it carries is below 0xFF
source += sizeof(uint32_t);
}
if (static_cast<size_t>(limit - source) < part_size) {
throw std::out_of_range("Malformed encoding, overflow in reading part");
}
*out = message_t(source, part_size);
++out;
source += part_size;
}
assert(source == limit);
return out;
}
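/* Usage sketch (illustrative): reverse the worked example after encode()
   above, turning the single `bundle` message back into its parts, e.g. on
   the receiving side of a connection that carried the bundle as one frame.

     std::vector<zmq::message_t> decoded;
     zmq::decode(bundle, std::back_inserter(decoded));
     // decoded[0] holds "hi", decoded[1] holds "data"
*/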
#endif
#ifdef ZMQ_HAS_RVALUE_REFS
/*
This class handles multipart messaging. It is the C++ equivalent of zmsg.h,
which is part of CZMQ (the high-level C binding). Furthermore, it is a major
improvement compared to zmsg.hpp, which is part of the examples in the ØMQ
Guide. Unnecessary copying is avoided by using move semantics to efficiently
    add/remove parts. A usage sketch follows the class definition below.
*/
class multipart_t
{
private:
std::deque<message_t> m_parts;
public:
typedef std::deque<message_t>::value_type value_type;
typedef std::deque<message_t>::iterator iterator;
typedef std::deque<message_t>::const_iterator const_iterator;
typedef std::deque<message_t>::reverse_iterator reverse_iterator;
typedef std::deque<message_t>::const_reverse_iterator const_reverse_iterator;
// Default constructor
multipart_t() {}
// Construct from socket receive
multipart_t(socket_ref socket) { recv(socket); }
// Construct from memory block
multipart_t(const void *src, size_t size) { addmem(src, size); }
// Construct from string
multipart_t(const std::string &string) { addstr(string); }
// Construct from message part
multipart_t(message_t &&message) { add(std::move(message)); }
// Move constructor
multipart_t(multipart_t &&other) ZMQ_NOTHROW { m_parts = std::move(other.m_parts); }
// Move assignment operator
multipart_t &operator=(multipart_t &&other) ZMQ_NOTHROW
{
m_parts = std::move(other.m_parts);
return *this;
}
// Destructor
virtual ~multipart_t() { clear(); }
message_t &operator[](size_t n) { return m_parts[n]; }
const message_t &operator[](size_t n) const { return m_parts[n]; }
message_t &at(size_t n) { return m_parts.at(n); }
const message_t &at(size_t n) const { return m_parts.at(n); }
iterator begin() { return m_parts.begin(); }
const_iterator begin() const { return m_parts.begin(); }
const_iterator cbegin() const { return m_parts.cbegin(); }
reverse_iterator rbegin() { return m_parts.rbegin(); }
const_reverse_iterator rbegin() const { return m_parts.rbegin(); }
iterator end() { return m_parts.end(); }
const_iterator end() const { return m_parts.end(); }
const_iterator cend() const { return m_parts.cend(); }
reverse_iterator rend() { return m_parts.rend(); }
const_reverse_iterator rend() const { return m_parts.rend(); }
// Delete all parts
void clear() { m_parts.clear(); }
// Get number of parts
size_t size() const { return m_parts.size(); }
// Check if number of parts is zero
bool empty() const { return m_parts.empty(); }
// Receive multipart message from socket
bool recv(socket_ref socket, int flags = 0)
{
clear();
bool more = true;
while (more) {
message_t message;
#ifdef ZMQ_CPP11
if (!socket.recv(message, static_cast<recv_flags>(flags)))
return false;
#else
if (!socket.recv(&message, flags))
return false;
#endif
more = message.more();
add(std::move(message));
}
return true;
}
// Send multipart message to socket
bool send(socket_ref socket, int flags = 0)
{
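        // Strip any caller-supplied SNDMORE; the more flag is re-applied per part below.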
flags &= ~(ZMQ_SNDMORE);
bool more = size() > 0;
while (more) {
message_t message = pop();
more = size() > 0;
#ifdef ZMQ_CPP11
if (!socket.send(message, static_cast<send_flags>(
(more ? ZMQ_SNDMORE : 0) | flags)))
return false;
#else
if (!socket.send(message, (more ? ZMQ_SNDMORE : 0) | flags))
return false;
#endif
}
clear();
return true;
}
// Concatenate other multipart to front
void prepend(multipart_t &&other)
{
while (!other.empty())
push(other.remove());
}
// Concatenate other multipart to back
void append(multipart_t &&other)
{
while (!other.empty())
add(other.pop());
}
// Push memory block to front
void pushmem(const void *src, size_t size)
{
m_parts.push_front(message_t(src, size));
}
// Push memory block to back
void addmem(const void *src, size_t size)
{
m_parts.push_back(message_t(src, size));
}
// Push string to front
void pushstr(const std::string &string)
{
m_parts.push_front(message_t(string.data(), string.size()));
}
// Push string to back
void addstr(const std::string &string)
{
m_parts.push_back(message_t(string.data(), string.size()));
}
// Push type (fixed-size) to front
template<typename T> void pushtyp(const T &type)
{
static_assert(!std::is_same<T, std::string>::value,
"Use pushstr() instead of pushtyp<std::string>()");
m_parts.push_front(message_t(&type, sizeof(type)));
}
// Push type (fixed-size) to back
template<typename T> void addtyp(const T &type)
{
static_assert(!std::is_same<T, std::string>::value,
"Use addstr() instead of addtyp<std::string>()");
m_parts.push_back(message_t(&type, sizeof(type)));
}
// Push message part to front
void push(message_t &&message) { m_parts.push_front(std::move(message)); }
// Push message part to back
void add(message_t &&message) { m_parts.push_back(std::move(message)); }
// Alias to allow std::back_inserter()
void push_back(message_t &&message) { m_parts.push_back(std::move(message)); }
// Pop string from front
std::string popstr()
{
std::string string(m_parts.front().data<char>(), m_parts.front().size());
m_parts.pop_front();
return string;
}
// Pop type (fixed-size) from front
template<typename T> T poptyp()
{
static_assert(!std::is_same<T, std::string>::value,
"Use popstr() instead of poptyp<std::string>()");
if (sizeof(T) != m_parts.front().size())
throw std::runtime_error(
"Invalid type, size does not match the message size");
T type = *m_parts.front().data<T>();
m_parts.pop_front();
return type;
}
// Pop message part from front
message_t pop()
{
message_t message = std::move(m_parts.front());
m_parts.pop_front();
return message;
}
// Pop message part from back
message_t remove()
{
message_t message = std::move(m_parts.back());
m_parts.pop_back();
return message;
}
// get message part from front
const message_t &front() { return m_parts.front(); }
// get message part from back
const message_t &back() { return m_parts.back(); }
// Get pointer to a specific message part
const message_t *peek(size_t index) const { return &m_parts[index]; }
// Get a string copy of a specific message part
std::string peekstr(size_t index) const
{
std::string string(m_parts[index].data<char>(), m_parts[index].size());
return string;
}
    // Peek type (fixed-size) at a given index
template<typename T> T peektyp(size_t index) const
{
static_assert(!std::is_same<T, std::string>::value,
"Use peekstr() instead of peektyp<std::string>()");
if (sizeof(T) != m_parts[index].size())
throw std::runtime_error(
"Invalid type, size does not match the message size");
T type = *m_parts[index].data<T>();
return type;
}
// Create multipart from type (fixed-size)
template<typename T> static multipart_t create(const T &type)
{
multipart_t multipart;
multipart.addtyp(type);
return multipart;
}
// Copy multipart
multipart_t clone() const
{
multipart_t multipart;
for (size_t i = 0; i < size(); i++)
multipart.addmem(m_parts[i].data(), m_parts[i].size());
return multipart;
}
// Dump content to string
std::string str() const
{
std::stringstream ss;
for (size_t i = 0; i < m_parts.size(); i++) {
const unsigned char *data = m_parts[i].data<unsigned char>();
size_t size = m_parts[i].size();
// Dump the message as text or binary
bool isText = true;
for (size_t j = 0; j < size; j++) {
if (data[j] < 32 || data[j] > 127) {
isText = false;
break;
}
}
ss << "\n[" << std::dec << std::setw(3) << std::setfill('0') << size
<< "] ";
if (size >= 1000) {
ss << "... (too big to print)";
continue;
}
for (size_t j = 0; j < size; j++) {
if (isText)
ss << static_cast<char>(data[j]);
else
ss << std::hex << std::setw(2) << std::setfill('0')
<< static_cast<short>(data[j]);
}
}
return ss.str();
}
// Check if equal to other multipart
bool equal(const multipart_t *other) const ZMQ_NOTHROW
{
return *this == *other;
}
bool operator==(const multipart_t &other) const ZMQ_NOTHROW
{
if (size() != other.size())
return false;
for (size_t i = 0; i < size(); i++)
if (at(i) != other.at(i))
return false;
return true;
}
bool operator!=(const multipart_t &other) const ZMQ_NOTHROW
{
return !(*this == other);
}
#ifdef ZMQ_CPP11
// Return single part message_t encoded from this multipart_t.
message_t encode() const { return zmq::encode(*this); }
// Decode encoded message into multiple parts and append to self.
void decode_append(const message_t &encoded)
{
zmq::decode(encoded, std::back_inserter(*this));
}
// Return a new multipart_t containing the decoded message_t.
static multipart_t decode(const message_t &encoded)
{
multipart_t tmp;
zmq::decode(encoded, std::back_inserter(tmp));
return tmp;
}
#endif
private:
// Disable implicit copying (moving is more efficient)
multipart_t(const multipart_t &other) ZMQ_DELETED_FUNCTION;
void operator=(const multipart_t &other) ZMQ_DELETED_FUNCTION;
}; // class multipart_t
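/* Usage sketch for multipart_t (illustrative only, not part of the upstream
   header), assuming connected zmq::socket_t sockets `req` and `rep` and that
   <cstdint> is included.

     zmq::multipart_t request;
     request.addstr("resize");
     std::uint32_t width = 640;
     request.addtyp(width);                // fixed-size binary part
     request.send(req);

     zmq::multipart_t incoming(rep);       // this constructor receives all parts
     std::string cmd = incoming.popstr();  // "resize"
     auto w = incoming.poptyp<std::uint32_t>();  // 640
*/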
inline std::ostream &operator<<(std::ostream &os, const multipart_t &msg)
{
return os << msg.str();
}
#endif // ZMQ_HAS_RVALUE_REFS
#if defined(ZMQ_BUILD_DRAFT_API) && defined(ZMQ_CPP11) && defined(ZMQ_HAVE_POLLER)
class active_poller_t
{
public:
active_poller_t() = default;
~active_poller_t() = default;
active_poller_t(const active_poller_t &) = delete;
active_poller_t &operator=(const active_poller_t &) = delete;
active_poller_t(active_poller_t &&src) = default;
active_poller_t &operator=(active_poller_t &&src) = default;
using handler_type = std::function<void(event_flags)>;
void add(zmq::socket_ref socket, event_flags events, handler_type handler)
{
const poller_ref_t ref{socket};
if (!handler)
throw std::invalid_argument("null handler in active_poller_t::add (socket)");
auto ret = handlers.emplace(
ref, std::make_shared<handler_type>(std::move(handler)));
if (!ret.second)
throw error_t(EINVAL); // already added
try {
base_poller.add(socket, events, ret.first->second.get());
need_rebuild = true;
}
catch (...) {
// rollback
handlers.erase(ref);
throw;
}
}
void add(fd_t fd, event_flags events, handler_type handler)
{
const poller_ref_t ref{fd};
if (!handler)
throw std::invalid_argument("null handler in active_poller_t::add (fd)");
auto ret = handlers.emplace(
ref, std::make_shared<handler_type>(std::move(handler)));
if (!ret.second)
throw error_t(EINVAL); // already added
try {
base_poller.add(fd, events, ret.first->second.get());
need_rebuild = true;
}
catch (...) {
// rollback
handlers.erase(ref);
throw;
}
}
void remove(zmq::socket_ref socket)
{
base_poller.remove(socket);
handlers.erase(socket);
need_rebuild = true;
}
void remove(fd_t fd)
{
base_poller.remove(fd);
handlers.erase(fd);
need_rebuild = true;
}
void modify(zmq::socket_ref socket, event_flags events)
{
base_poller.modify(socket, events);
}
void modify(fd_t fd, event_flags events)
{
base_poller.modify(fd, events);
}
size_t wait(std::chrono::milliseconds timeout)
{
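        // Lazily rebuild after add()/remove(): size the reusable event buffer and
        // take shared_ptr copies of the handlers so they stay alive even if a
        // callback erases its own entry while events are being dispatched.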
if (need_rebuild) {
poller_events.resize(handlers.size());
poller_handlers.clear();
poller_handlers.reserve(handlers.size());
for (const auto &handler : handlers) {
poller_handlers.push_back(handler.second);
}
need_rebuild = false;
}
const auto count = base_poller.wait_all(poller_events, timeout);
std::for_each(poller_events.begin(),
poller_events.begin() + static_cast<ptrdiff_t>(count),
[](decltype(base_poller)::event_type &event) {
assert(event.user_data != nullptr);
(*event.user_data)(event.events);
});
return count;
}
ZMQ_NODISCARD bool empty() const noexcept { return handlers.empty(); }
size_t size() const noexcept { return handlers.size(); }
private:
bool need_rebuild{false};
poller_t<handler_type> base_poller{};
std::unordered_map<zmq::poller_ref_t, std::shared_ptr<handler_type>> handlers{};
std::vector<decltype(base_poller)::event_type> poller_events{};
std::vector<std::shared_ptr<handler_type>> poller_handlers{};
}; // class active_poller_t
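/* Usage sketch (illustrative only, draft API): dispatch readable events to a
   callback, assuming a connected zmq::socket_t named `sock`.

     zmq::active_poller_t poller;
     poller.add(sock, zmq::event_flags::pollin, [&sock](zmq::event_flags) {
         zmq::message_t msg;
         if (sock.recv(msg, zmq::recv_flags::dontwait)) {
             // handle msg ...
         }
     });
     // Blocks for up to 100 ms and returns the number of handlers invoked.
     const size_t handled = poller.wait(std::chrono::milliseconds{100});
*/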
#endif // defined(ZMQ_BUILD_DRAFT_API) && defined(ZMQ_CPP11) && defined(ZMQ_HAVE_POLLER)
} // namespace zmq
#endif // __ZMQ_ADDON_HPP_INCLUDED__
