From a95631ee97bb24861af6bdeec380270459631e8e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 26 Jun 2024 19:26:13 +0300 Subject: [PATCH 01/29] readme : update API notes --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 6ca5ba43e..99b16f6e2 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ### Recent API changes +- [2024 Jun 26] The source code and CMake build scripts have been restructured https://github.com/ggerganov/llama.cpp/pull/8006 - [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807 - [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341 - [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122 From 0e814dfc42b4b57ad19598d239557b6a977ca16c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 26 Jun 2024 19:32:07 +0300 Subject: [PATCH 02/29] devops : remove clblast + LLAMA_CUDA -> GGML_CUDA (#8139) ggml-ci --- .devops/full-cuda.Dockerfile | 2 +- .devops/full-rocm.Dockerfile | 2 +- .devops/llama-cli-cuda.Dockerfile | 2 +- .devops/llama-cli-intel.Dockerfile | 10 +-- .devops/llama-cli-rocm.Dockerfile | 2 +- .devops/llama-cli-vulkan.Dockerfile | 2 +- .devops/llama-cpp-clblast.srpm.spec | 84 -------------------------- .devops/llama-cpp-cuda.srpm.spec | 2 +- .devops/llama-server-cuda.Dockerfile | 2 +- .devops/llama-server-intel.Dockerfile | 10 +-- .devops/llama-server-rocm.Dockerfile | 2 +- .devops/llama-server-vulkan.Dockerfile | 2 +- 12 files changed, 19 insertions(+), 103 deletions(-) delete mode 100644 .devops/llama-cpp-clblast.srpm.spec diff --git a/.devops/full-cuda.Dockerfile b/.devops/full-cuda.Dockerfile index f6073f662..2a7da586a 100644 --- a/.devops/full-cuda.Dockerfile +++ b/.devops/full-cuda.Dockerfile @@ -27,7 +27,7 @@ COPY . . # Set nvcc architecture ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} # Enable CUDA -ENV LLAMA_CUDA=1 +ENV GGML_CUDA=1 # Enable cURL ENV LLAMA_CURL=1 diff --git a/.devops/full-rocm.Dockerfile b/.devops/full-rocm.Dockerfile index 0314d469b..5cbd2e7a1 100644 --- a/.devops/full-rocm.Dockerfile +++ b/.devops/full-rocm.Dockerfile @@ -36,7 +36,7 @@ COPY . . # Set nvcc architecture ENV GPU_TARGETS=${ROCM_DOCKER_ARCH} # Enable ROCm -ENV LLAMA_HIPBLAS=1 +ENV GGML_HIPBLAS=1 ENV CC=/opt/rocm/llvm/bin/clang ENV CXX=/opt/rocm/llvm/bin/clang++ diff --git a/.devops/llama-cli-cuda.Dockerfile b/.devops/llama-cli-cuda.Dockerfile index d5ce538f6..bff946cbc 100644 --- a/.devops/llama-cli-cuda.Dockerfile +++ b/.devops/llama-cli-cuda.Dockerfile @@ -21,7 +21,7 @@ COPY . . # Set nvcc architecture ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} # Enable CUDA -ENV LLAMA_CUDA=1 +ENV GGML_CUDA=1 RUN make -j$(nproc) llama-cli diff --git a/.devops/llama-cli-intel.Dockerfile b/.devops/llama-cli-intel.Dockerfile index 6789e17af..bd816f9f5 100644 --- a/.devops/llama-cli-intel.Dockerfile +++ b/.devops/llama-cli-intel.Dockerfile @@ -2,7 +2,7 @@ ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04 FROM intel/oneapi-basekit:$ONEAPI_VERSION as build -ARG LLAMA_SYCL_F16=OFF +ARG GGML_SYCL_F16=OFF RUN apt-get update && \ apt-get install -y git @@ -10,11 +10,11 @@ WORKDIR /app COPY . . 
-RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \ - echo "LLAMA_SYCL_F16 is set" && \ - export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \ +RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ + echo "GGML_SYCL_F16 is set" && \ + export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ fi && \ - cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \ + cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \ cmake --build build --config Release --target llama-cli FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime diff --git a/.devops/llama-cli-rocm.Dockerfile b/.devops/llama-cli-rocm.Dockerfile index 7e8a6f0fa..caa507b08 100644 --- a/.devops/llama-cli-rocm.Dockerfile +++ b/.devops/llama-cli-rocm.Dockerfile @@ -36,7 +36,7 @@ COPY . . # Set nvcc architecture ENV GPU_TARGETS=${ROCM_DOCKER_ARCH} # Enable ROCm -ENV LLAMA_HIPBLAS=1 +ENV GGML_HIPBLAS=1 ENV CC=/opt/rocm/llvm/bin/clang ENV CXX=/opt/rocm/llvm/bin/clang++ diff --git a/.devops/llama-cli-vulkan.Dockerfile b/.devops/llama-cli-vulkan.Dockerfile index 7a0abe71f..6155d5881 100644 --- a/.devops/llama-cli-vulkan.Dockerfile +++ b/.devops/llama-cli-vulkan.Dockerfile @@ -14,7 +14,7 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key # Build it WORKDIR /app COPY . . -RUN cmake -B build -DLLAMA_VULKAN=1 && \ +RUN cmake -B build -DGGML_VULKAN=1 && \ cmake --build build --config Release --target llama-cli # Clean up diff --git a/.devops/llama-cpp-clblast.srpm.spec b/.devops/llama-cpp-clblast.srpm.spec deleted file mode 100644 index 013952191..000000000 --- a/.devops/llama-cpp-clblast.srpm.spec +++ /dev/null @@ -1,84 +0,0 @@ -# SRPM for building from source and packaging an RPM for RPM-based distros. -# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages -# Built and maintained by John Boero - boeroboy@gmail.com -# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal - -# Notes for llama.cpp: -# 1. Tags are currently based on hash - which will not sort asciibetically. -# We need to declare standard versioning if people want to sort latest releases. -# 2. Builds for CUDA/OpenCL support are separate, with different depenedencies. -# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed. -# Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo -# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries. -# It is up to the user to install the correct vendor-specific support. - -Name: llama.cpp-clblast -Version: %( date "+%%Y%%m%%d" ) -Release: 1%{?dist} -Summary: OpenCL Inference of LLaMA model in C/C++ -License: MIT -Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz -BuildRequires: coreutils make gcc-c++ git mesa-libOpenCL-devel clblast-devel -Requires: clblast -URL: https://github.com/ggerganov/llama.cpp - -%define debug_package %{nil} -%define source_date_epoch_from_changelog 0 - -%description -CPU inference for Meta's Lllama2 models using default options. 
- -%prep -%setup -n llama.cpp-master - -%build -make -j LLAMA_CLBLAST=1 - -%install -mkdir -p %{buildroot}%{_bindir}/ -cp -p llama-cli %{buildroot}%{_bindir}/llama-clblast-cli -cp -p llama-server %{buildroot}%{_bindir}/llama-clblast-server -cp -p llama-simple %{buildroot}%{_bindir}/llama-clblast-simple - -mkdir -p %{buildroot}/usr/lib/systemd/system -%{__cat} < %{buildroot}/usr/lib/systemd/system/llamaclblast.service -[Unit] -Description=Llama.cpp server, CPU only (no GPU support in this build). -After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target - -[Service] -Type=simple -EnvironmentFile=/etc/sysconfig/llama -ExecStart=/usr/bin/llama-clblast-server $LLAMA_ARGS -ExecReload=/bin/kill -s HUP $MAINPID -Restart=never - -[Install] -WantedBy=default.target -EOF - -mkdir -p %{buildroot}/etc/sysconfig -%{__cat} < %{buildroot}/etc/sysconfig/llama -LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin" -EOF - -%clean -rm -rf %{buildroot} -rm -rf %{_builddir}/* - -%files -%{_bindir}/llama-clblast-cli -%{_bindir}/llama-clblast-server -%{_bindir}/llama-clblast-simple -/usr/lib/systemd/system/llamaclblast.service -%config /etc/sysconfig/llama - - -%pre - -%post - -%preun -%postun - -%changelog diff --git a/.devops/llama-cpp-cuda.srpm.spec b/.devops/llama-cpp-cuda.srpm.spec index cbdf43626..7425d3a9d 100644 --- a/.devops/llama-cpp-cuda.srpm.spec +++ b/.devops/llama-cpp-cuda.srpm.spec @@ -32,7 +32,7 @@ CPU inference for Meta's Lllama2 models using default options. %setup -n llama.cpp-master %build -make -j LLAMA_CUDA=1 +make -j GGML_CUDA=1 %install mkdir -p %{buildroot}%{_bindir}/ diff --git a/.devops/llama-server-cuda.Dockerfile b/.devops/llama-server-cuda.Dockerfile index 7bef07a05..d7eaa0925 100644 --- a/.devops/llama-server-cuda.Dockerfile +++ b/.devops/llama-server-cuda.Dockerfile @@ -21,7 +21,7 @@ COPY . . # Set nvcc architecture ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} # Enable CUDA -ENV LLAMA_CUDA=1 +ENV GGML_CUDA=1 # Enable cURL ENV LLAMA_CURL=1 diff --git a/.devops/llama-server-intel.Dockerfile b/.devops/llama-server-intel.Dockerfile index 3bf1670ec..8f8fef8c0 100644 --- a/.devops/llama-server-intel.Dockerfile +++ b/.devops/llama-server-intel.Dockerfile @@ -2,7 +2,7 @@ ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04 FROM intel/oneapi-basekit:$ONEAPI_VERSION as build -ARG LLAMA_SYCL_F16=OFF +ARG GGML_SYCL_F16=OFF RUN apt-get update && \ apt-get install -y git libcurl4-openssl-dev @@ -10,11 +10,11 @@ WORKDIR /app COPY . . -RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \ - echo "LLAMA_SYCL_F16 is set" && \ - export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \ +RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ + echo "GGML_SYCL_F16 is set" && \ + export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ fi && \ - cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \ + cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \ cmake --build build --config Release --target llama-server FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime diff --git a/.devops/llama-server-rocm.Dockerfile b/.devops/llama-server-rocm.Dockerfile index 4b1cdc320..af96c3325 100644 --- a/.devops/llama-server-rocm.Dockerfile +++ b/.devops/llama-server-rocm.Dockerfile @@ -36,7 +36,7 @@ COPY . . 
# Set nvcc architecture ENV GPU_TARGETS=${ROCM_DOCKER_ARCH} # Enable ROCm -ENV LLAMA_HIPBLAS=1 +ENV GGML_HIPBLAS=1 ENV CC=/opt/rocm/llvm/bin/clang ENV CXX=/opt/rocm/llvm/bin/clang++ diff --git a/.devops/llama-server-vulkan.Dockerfile b/.devops/llama-server-vulkan.Dockerfile index 2bc2e45d3..49062f84b 100644 --- a/.devops/llama-server-vulkan.Dockerfile +++ b/.devops/llama-server-vulkan.Dockerfile @@ -14,7 +14,7 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key # Build it WORKDIR /app COPY . . -RUN cmake -B build -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \ +RUN cmake -B build -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \ cmake --build build --config Release --target llama-server # Clean up From 4713bf3093d58a3e12368ab2ab5fc3630f27803e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 26 Jun 2024 19:36:44 +0300 Subject: [PATCH 03/29] authors : regen --- AUTHORS | 129 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 128 insertions(+), 1 deletion(-) diff --git a/AUTHORS b/AUTHORS index b029f13da..1bd36158a 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,8 +1,9 @@ -# date: Tue Apr 9 09:17:14 EEST 2024 +# date: Wed Jun 26 19:36:34 EEST 2024 # this file is auto-generated by scripts/gen-authors.sh 0cc4m 0xspringtime <110655352+0xspringtime@users.noreply.github.com> +20kdc 2f38b454 3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com> 44670 <44670@users.noreply.github.com> @@ -11,14 +12,18 @@ AT Aarni Koskela Aaron Miller Aaryaman Vasishta +Abheek Gulati Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Abhishek Gopinath K <31348521+overtunned@users.noreply.github.com> Adithya Balaji AdithyanI Adrian Adrian Hesketh +Ahmet Zeer AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> Aisuko +Akarshan Biswas +Albert Jin Alberto <57916483+albbus-stack@users.noreply.github.com> Alex Alex Azarov @@ -35,19 +40,24 @@ Ali Nehzat Ali Tariq Alon AlpinDale <52078762+AlpinDale@users.noreply.github.com> +Amir AmirAli Mirian <37371367+amiralimi@users.noreply.github.com> Ananta Bastola Anas Ahouzi <112881240+aahouzi@users.noreply.github.com> András Salamon Andrei Andrew Canis +Andrew Downing Andrew Duffy Andrew Godfrey +Andy Tai Arik Poznanski Artem +Artem Zinnatullin Artyom Lebedev Asbjørn Olling Ásgeir Bjarni Ingvarsson +Ashish <1856117+ashishdatta@users.noreply.github.com> Ashok Gelal <401055+ashokgelal@users.noreply.github.com> Ashraful Islam Atsushi Tatsuma @@ -57,35 +67,46 @@ BADR Bach Le Bailey Chittle <39804642+bachittle@users.noreply.github.com> BarfingLemurs <128182951+BarfingLemurs@users.noreply.github.com> +Bartowski Behnam M <58621210+ibehnam@users.noreply.github.com> +Ben Ashbaugh Ben Garney Ben Siraphob Ben Williams +Benjamin Findley <39356821+Kartoffelsaft@users.noreply.github.com> Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com> Bernat Vadell +Bingan <70050083+binganao@users.noreply.github.com> Bodo Graumann Bono Lv Borislav Stanimirov Branden Butler Brian Bruce MacDonald +Bryan Honof CJ Pais CRD716 +Calvin Laurenson Cameron Cameron Kaiser +Carolinabanana <140120812+Carolinabanana@users.noreply.github.com> Casey Primozic Casey Primozic CausalLM <148736309+CausalLM@users.noreply.github.com> Cebtenzzre Chad Brewbaker +Chao Jiang Cheng Shao +Chris Elrod Chris Kuehl Christian Demsar Christian Demsar Christian Falch <875252+chrfalch@users.noreply.github.com> Christian Kögler +Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com> Clark Saben <76020733+csaben@users.noreply.github.com> Clint Herron 
+CrispStrobe <154636388+CrispStrobe@users.noreply.github.com> Cuong Trinh Manh DAN™ Damian Stewart @@ -95,8 +116,12 @@ Daniel Bevenius Daniel Drake Daniel Hiltgen Daniel Illescas Romero +Daniele <57776841+daniandtheweb@users.noreply.github.com> DannyDaemonic Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com> +Dave +Dave Airlie +Dave Airlie Dave Della Costa David Friehs David Kennedy @@ -104,10 +129,13 @@ David Pflug David Renshaw David Sommers <12738+databyte@users.noreply.github.com> David Yang +Dawid Potocki Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com> Dean Deins +Deven Mistry <31466137+deven367@users.noreply.github.com> Didzis Gosko +Djip007 Don Mahurin DooWoong Lee (David) Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com> @@ -116,8 +144,11 @@ Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com> Ebey Abraham Ed Lee Ed Lepedus +Eddie-Wang Edward Taylor +Elaine Elbios <141279586+Elbios@users.noreply.github.com> +Elton Kola Engininja2 <139037756+Engininja2@users.noreply.github.com> Equim Eric Sommerlade @@ -143,37 +174,47 @@ Firat Folko-Ven <71110216+Folko-Ven@users.noreply.github.com> Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com> Francisco Melo <43780565+francis2tm@users.noreply.github.com> +Frank Mai FrankHB +Fred Douglas <43351173+fredlas@users.noreply.github.com> Frederik Vogel Gabe Goodhart GainLee Galunid Gary Linscott Gary Mulder +Gavin Zhao Genkagaku.GPT Georgi Gerganov Gilad S +Giuseppe Scrivano GiviMAD Govlzkoy Guillaume "Vermeille" Sanchez Guillaume Wenzek Guoteng <32697156+SolenoidWGT@users.noreply.github.com> Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com> +Haggai Nuchi Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com> +Hamdoud Hakem <90524568+hamdoudhakem@users.noreply.github.com> +HanishKVC Haohui Mai Haoxiang Fei Harald Fernengel Hatsune Miku <129688334+at8u@users.noreply.github.com> +HatsuneMikuUwU33 <173229399+HatsuneMikuUwU33@users.noreply.github.com> Henk Poley Henri Vasserman Henrik Forstén Herman Semenov Hesen Peng Hoang Nguyen +Hong Bo PENG Hongyu Ouyang <96765450+casavaca@users.noreply.github.com> Howard Su Hua Jiang Huawei Lin +Hugo Roussel Ian Bull Ian Bull Ian Scrivener @@ -190,8 +231,10 @@ Ivan Stepanov JH23X <165871467+JH23X@users.noreply.github.com> Jack Mousseau JackJollimore <130917767+JackJollimore@users.noreply.github.com> +Jaemin Son Jag Chadha Jakub N +James A Capozzoli <157492257+jac-jim@users.noreply.github.com> James Reynolds Jan Boon Jan Boon @@ -205,12 +248,17 @@ Jean-Michaël Celerier Jed Fox Jeffrey Quesnelle Jesse Jojo Johnson +Jeximo Jhen-Jie Hong Jiahao Li Jian Liao JidongZhang-THU <1119708529@qq.com> Jinwoo Jeong <33892306+williamjeong2@users.noreply.github.com> Jiří Podivín <66251151+jpodivin@users.noreply.github.com> +Jiří Sejkora +Joan Fontanals +Joan Fontanals +Johan Johannes Gäßler Johannes Rudolph John <78893154+cmp-nct@users.noreply.github.com> @@ -221,15 +269,19 @@ Jonas Wunderlich <32615971+jonas-w@users.noreply.github.com> Jorge A <161275481+jorgealias@users.noreply.github.com> Jose Maldonado <63384398+yukiteruamano@users.noreply.github.com> Joseph Stahl <1269177+josephst@users.noreply.github.com> +Josh Ramer Joyce Juan Calderon-Perez <835733+gaby@users.noreply.github.com> Judd Julius Arkenberg Jun Jie <71215065+junnjiee16@users.noreply.github.com> +Junyang Lin Juraj Bednar Justin Parker Justin Suess +Justina Cho Justine Tunney +Justine Tunney Juuso Alasuutari KASR Kamil Tomšík @@ -242,6 +294,7 @@ Kawrakow 
<48489457+ikawrakow@users.noreply.github.com> Keiichi Tabata Kenvix ⭐ Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> +Kevin Gibbons Kevin Ji <1146876+kevinji@users.noreply.github.com> Kevin Kwok Kevin Lo @@ -257,6 +310,7 @@ Laura Lee <44310445+lx200916@users.noreply.github.com> Lee Drake Leng Yue +Leon Knauer LeonEricsson <70749762+LeonEricsson@users.noreply.github.com> Leonardo Neumann Li Tan @@ -265,20 +319,26 @@ LoganDark LostRuins <39025047+LostRuins@users.noreply.github.com> Luciano Luo Tian +Lyle Dean M. Yusuf Sarıgöz Maarten ter Huurne Mack Straight Maël Kerbiriou MaggotHATE +Manuel <44313466+makuche@users.noreply.github.com> Marc Köhlbrugge Marco Matthies <71844+marcom@users.noreply.github.com> Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com> Marian Cepok Mark Fairbairn Marko Tasic +Markus Tavenrath +Martin Delille Martin Krasser Martin Schwaighofer Marvin Gießing +Masaya, Kato <62578291+msy-kato@users.noreply.github.com> +MasterYi1024 <39848311+MasterYi1024@users.noreply.github.com> Mateusz Charytoniuk Matheus C. França Matheus Gabriel Alves Silva @@ -287,8 +347,11 @@ Mathijs de Bruin Matt Clayton <156335168+mattjcly@users.noreply.github.com> Matt Pulver Matteo Boschini <12133566+mbosc@users.noreply.github.com> +Mattheus Chediak Matthew Tejo Matvey Soloviev +Max Krasnyansky +Max Krasnyansky Maxime <672982+maximegmd@users.noreply.github.com> Maximilian Winter Meng Zhang @@ -300,32 +363,41 @@ Michael Kesper Michael Klimenko Michael Podvitskiy Michael Potter +Michael de Gans Michaël de Vries Mihai Mike +Mikko Juola Minsoo Cheong <54794500+mscheong01@users.noreply.github.com> Mirko185 Mirror Azure <54669636+MirrorAzure@users.noreply.github.com> Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com> Mohammadreza Hendiani +Mohammadreza Hendiani Murilo Santana Musab Gultekin Nam D. Tran <42194884+namtranase@users.noreply.github.com> +Nathan Epstein NawafAlansari <72708095+NawafAlansari@users.noreply.github.com> Nebula +Neo Zhang <14088817+arthw@users.noreply.github.com> +Neo Zhang Neo Zhang Jianyu Neuman Vong Nexesenex <124105151+Nexesenex@users.noreply.github.com> Niall Coates <1349685+Niall-@users.noreply.github.com> Nicolai Weitkemper +Nicolás Pérez Nigel Bosch Niklas Korz +Nikolas <127742645+nneubacher@users.noreply.github.com> Nindaleth Oleksandr Nikitin Oleksii Maryshchenko Olivier Chafik Ondřej Čertík Ouadie EL FAROUKI +Patrice Ferlet Paul Tsochantaris Pavol Rusnak Pedro Cuenca @@ -343,9 +415,14 @@ RJ Adriaansen Radoslav Gerganov Radosław Gryta Rahul Vivek Nair <68507071+RahulVivekNair@users.noreply.github.com> +Raj Hammeer Singh Hada +Ralph Soika Rand Xie Randall Fitzgerald Reinforce-II +Ren Xuancheng +Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com> +RhinoDevel Riceball LEE Richard Kiss Richard Roberson @@ -373,6 +450,7 @@ Rowan Hart Rune <43761327+Rune-AI@users.noreply.github.com> Ryan Landay Ryder Wishart +Ryuei Rőczey Barnabás <31726601+An0nie@users.noreply.github.com> SakuraUmi Salvador E. 
Tropea @@ -386,6 +464,7 @@ SebastianApel <13675545+SebastianApel@users.noreply.github.com> Senemu <10880819+Senemu@users.noreply.github.com> Sergey Alirzaev Sergio López +Sertaç Özercan <852750+sozercan@users.noreply.github.com> SeungWon Jeong <65549245+redlion0929@users.noreply.github.com> ShadovvBeast Shakhar Dasgupta @@ -394,6 +473,7 @@ Shijie <821898965@qq.com> Shintarou Okada Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com> Shouzheng Liu +Shuichi Tsutsumi Sigbjørn Skjæret Simon Willison Siwen Yu @@ -405,11 +485,14 @@ Someone Someone Serge Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Spencer Sutton +Srihari-mcw <96763064+Srihari-mcw@users.noreply.github.com> Srinivas Billa Stefan Sydow +Steffen Röcker Stephan Walter Stephen Nichols Steve Grubb +Steven Prichard Steven Roussey Steward Garcia <57494570+FSSRepo@users.noreply.github.com> Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com> @@ -434,16 +517,19 @@ Tom C Tom Jobbins <784313+TheBloke@users.noreply.github.com> Tomas Tomáš Pazdiora +Tristan Druyen Tristan Ross Tungsten842 <886724vf@anonaddy.me> Tungsten842 Tushar UEXTM.com <84163508+uextm@users.noreply.github.com> +Ulrich Drepper Uzo Nweke Vaibhav Srivastav Val Kharitonov Valentin Konovalov Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com> +Victor Nogueira Victor Z. Peng Vlad Vladimir @@ -455,7 +541,9 @@ Weird Constructor Welby Seely Wentai Zhang WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com> +William Tambellini Willy Tarreau +Wouter <9594229+DifferentialityDevelopment@users.noreply.github.com> Wu Jian Ping Wu Jian Ping Xiake Sun @@ -466,6 +554,8 @@ Xiaoyi Chen Xingchen Song(宋星辰) Xuan Son Nguyen Yann Follet <131855179+YannFollet@users.noreply.github.com> +Yaroslav +Yazan Agha-Schrader Yiming Cui Yishuo Wang Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com> @@ -477,6 +567,7 @@ Zane Shannon Zay <95888118+isaiahbjork@users.noreply.github.com> Zenix Zhang Peiyuan +Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com> ZhouYuChen Ziad Ben Hadj-Alouane Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com> @@ -484,14 +575,18 @@ Zsapi a-n-n-a-l-e-e <150648636+a-n-n-a-l-e-e@users.noreply.github.com> adel boussaken afrideva <95653597+afrideva@users.noreply.github.com> +agray3 akawrykow <142945436+akawrykow@users.noreply.github.com> alexpinel <93524949+alexpinel@users.noreply.github.com> alonfaraj +alwqx +amd-lalithnc andrijdavid anon998 <131767832+anon998@users.noreply.github.com> anzz1 apaz apcameron <37645737+apcameron@users.noreply.github.com> +arch-btw <57669023+arch-btw@users.noreply.github.com> arcrank arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com> at8u <129688334+at8u@users.noreply.github.com> @@ -514,13 +609,17 @@ cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com> coezbek comex compilade <113953597+compilade@users.noreply.github.com> +compilade +cpumaxx <163466046+cpumaxx@users.noreply.github.com> crasm crasm daboe01 david raistrick +ddh0 ddpasa <112642920+ddpasa@users.noreply.github.com> deepdiffuser <112834445+deepdiffuser@users.noreply.github.com> divinity76 +dm4 dotpy314 <33351922+dotpy314@users.noreply.github.com> drbh ds5t5 <145942675+ds5t5@users.noreply.github.com> @@ -529,6 +628,7 @@ eastriver ebraminio eiery <19350831+eiery@users.noreply.github.com> eric8607242 +fairydreaming <166155368+fairydreaming@users.noreply.github.com> fraxy-v <65565042+fraxy-v@users.noreply.github.com> github-actions[bot] gliptic @@ -539,6 +639,7 @@ h-h-h-h 
<13482553+h-h-h-h@users.noreply.github.com> hankcs hoangmit hongbo.mo <352280764@qq.com> +hopkins385 <98618192+hopkins385@users.noreply.github.com> howlger howlger hutli <6594598+hutli@users.noreply.github.com> @@ -549,14 +650,22 @@ hydai iSma iacore <74560659+iacore@users.noreply.github.com> igarnier +intelmatt <61025942+intelmatt@users.noreply.github.com> iohub jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com> +jaime-m-p <167997752+jaime-m-p@users.noreply.github.com> jameswu2014 <545426914@qq.com> +jiez <373447296@qq.com> jneem +joecryptotoo <80373433+joecryptotoo@users.noreply.github.com> johnson442 <56517414+johnson442@users.noreply.github.com> +jojorne jon-chuang <9093549+jon-chuang@users.noreply.github.com> jp-x-g +jukofyork <69222624+jukofyork@users.noreply.github.com> +junchao-loongson <68935141+junchao-loongson@users.noreply.github.com> jwj7140 <32943891+jwj7140@users.noreply.github.com> +k.h.lai kaizau kalomaze <66376113+kalomaze@users.noreply.github.com> kang @@ -575,11 +684,15 @@ ldwang le.chang leejet limitedAtonement +liuwei-git <14815172+liuwei-git@users.noreply.github.com> lon <114724657+longregen@users.noreply.github.com> +loonerin <132926317+loonerin@users.noreply.github.com> +luoyu-intel m3ndax maddes8cht <55592906+maddes8cht@users.noreply.github.com> makomk manikbhandari +maor-ps <154728172+maor-ps@users.noreply.github.com> mdrokz mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com> minarchist @@ -593,15 +706,19 @@ ngc92 <7938269+ngc92@users.noreply.github.com> nhamanasu <45545786+nhamanasu@users.noreply.github.com> niansa/tuxifan niansa/tuxifan +nickp27 ningshanwutuobang nold nopperl <54780682+nopperl@users.noreply.github.com> nusu-github <29514220+nusu-github@users.noreply.github.com> olexiyb +omahs <73983677+omahs@users.noreply.github.com> oobabooga <112222186+oobabooga@users.noreply.github.com> opparco ostix360 <55257054+ostix360@users.noreply.github.com> +pengxin99 perserk +pmysl postmasters pudepiedj qingfengfenga <41416092+qingfengfenga@users.noreply.github.com> @@ -614,16 +731,19 @@ rhuddleston rimoliga <53384203+rimoliga@users.noreply.github.com> runfuture sandyiscool +sasha0552 semidark sharpHL <132747147+sharpHL@users.noreply.github.com> shibe2 singularity <12184989+singularity-s0@users.noreply.github.com> sjinzh +sjxx <63994076+ylsdamxssjxxdd@users.noreply.github.com> slaren <2141330+slaren@users.noreply.github.com> slaren snadampal <87143774+snadampal@users.noreply.github.com> staviq stduhpf +strawberrymelonpanda <152940198+strawberrymelonpanda@users.noreply.github.com> swittk takov751 <40316768+takov751@users.noreply.github.com> tarcey @@ -636,12 +756,16 @@ uint256_t uint256_t unbounded valiray <133289098+valiray@users.noreply.github.com> +vik +viric vodkaslime <646329483@qq.com> vvhg1 <94630311+vvhg1@users.noreply.github.com> vxiiduu <73044267+vxiiduu@users.noreply.github.com> wbpxre150 <100937007+wbpxre150@users.noreply.github.com> whoreson <139810751+whoreson@users.noreply.github.com> +woachk <24752637+woachk@users.noreply.github.com> wonjun Jang +woodx <124784234+woodx9@users.noreply.github.com> wzy <32936898+Freed-Wu@users.noreply.github.com> xaedes xaedes @@ -649,7 +773,10 @@ xloem <0xloem@gmail.com> yangli2 yuiseki zakkor +zhangkaihuo zhouwg <6889919+zhouwg@users.noreply.github.com> +zhouwg zrm +Ștefan-Gabriel Muscalu 源文雨 <41315874+fumiama@users.noreply.github.com> Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com> From f2d48fffde76d959fdb0da37316bdc09e5518eb1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: 
Wed, 26 Jun 2024 19:39:19 +0300 Subject: [PATCH 04/29] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index b6c57ec5e..2da33e913 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -5653a195935ea3ac54652644c9daf154dbc1571b +5378ea0d3c2f25bcd330ecb226ad2db454be86d0 From c7ab7b612cbdce04499575e713076a026af4b9c5 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 26 Jun 2024 20:20:22 +0200 Subject: [PATCH 05/29] make : fix missing -O3 (#8143) --- Makefile | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 64a6e6ff0..bbfe0f12b 100644 --- a/Makefile +++ b/Makefile @@ -148,12 +148,6 @@ ifndef UNAME_M UNAME_M := $(shell uname -m) endif -MK_CFLAGS += -O3 -MK_CXXFLAGS += -O3 -ifndef LLAMA_DEBUG -MK_NVCCFLAGS += -O3 -endif # LLAMA_DEBUG - # In GNU make default CXX is g++ instead of c++. Let's fix that so that users # of non-gcc compilers don't have to provide g++ alias or wrapper. DEFCC := cc @@ -312,7 +306,10 @@ ifdef LLAMA_DEBUG MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS endif else - MK_CPPFLAGS += -DNDEBUG + MK_CPPFLAGS += -DNDEBUG + MK_CFLAGS += -O3 + MK_CXXFLAGS += -O3 + MK_NVCCFLAGS += -O3 endif ifdef LLAMA_SANITIZE_THREAD From 31ec3993f6e050322a249c07af79dbde66ea6ddc Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 26 Jun 2024 21:34:14 +0200 Subject: [PATCH 06/29] ggml : add GGML_CUDA_USE_GRAPHS option, restore GGML_CUDA_FORCE_CUBLAS (cmake) (#8140) --- CMakeLists.txt | 1 + ggml/CMakeLists.txt | 2 ++ ggml/src/CMakeLists.txt | 5 ++++- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 18297834e..7a7197282 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -80,6 +80,7 @@ set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED}) set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS}) set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS}) set(GGML_LLAMAFILE ON) +set(GGML_CUDA_USE_GRAPHS ON) # transition helpers function (llama_option_depr TYPE OLD NEW) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index f3763f7eb..0d0d52d57 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -109,6 +109,7 @@ option(GGML_LLAMAFILE "ggml: use ggml SGEMM" option(GGML_CUDA "ggml: use CUDA" OFF) option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF) option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF) +option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF) set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels") set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels") option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF) @@ -119,6 +120,7 @@ set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF) option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF) option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF) +option(GGML_CUDA_USE_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" OFF) option(GGML_CURL "ggml: use libcurl to download model from an URL" OFF) option(GGML_HIPBLAS "ggml: use hipBLAS" OFF) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index ba341d374..d0f4097d8 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -295,12 +295,15 @@ if (GGML_CUDA) list(APPEND GGML_CDEF_PUBLIC GGML_USE_CUDA) - 
add_compile_definitions(GGML_CUDA_USE_GRAPHS) add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X}) add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y}) add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER}) add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE}) + if (GGML_CUDA_USE_GRAPHS) + add_compile_definitions(GGML_CUDA_USE_GRAPHS) + endif() + if (GGML_CUDA_FORCE_DMMV) add_compile_definitions(GGML_CUDA_FORCE_DMMV) endif() From ae5d0f4b899ff2842bfca561370c945ad8d4368b Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 26 Jun 2024 21:59:28 +0200 Subject: [PATCH 07/29] ci : publish new docker images only when the files change (#8142) --- .github/workflows/build.yml | 4 ++-- .github/workflows/docker.yml | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0d91fc4e4..208515287 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -10,10 +10,10 @@ on: push: branches: - master - paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m'] + paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal'] pull_request: types: [opened, synchronize, reopened] - paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m'] + paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal'] concurrency: group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 01f1a4522..bf94b2024 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -14,6 +14,7 @@ on: push: branches: - master + paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal'] concurrency: group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} From c70d117c37cc7876e775d1e2722208a50c52edb3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 26 Jun 2024 23:25:22 +0300 Subject: [PATCH 08/29] scripts : fix filename sync --- scripts/sync-ggml-am.sh | 71 ++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 36 deletions(-) diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh index 9e654180b..b05a33747 100755 --- a/scripts/sync-ggml-am.sh +++ b/scripts/sync-ggml-am.sh @@ -136,42 +136,41 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then # LICENSE -> LICENSE # scripts/gen-authors.sh -> scripts/gen-authors.sh - cat ggml-src.patch | sed \ - -e 's/CMakeLists.txt/ggml\/CMakeLists.txt/g' \ - -e 's/src\/CMakeLists.txt/ggml\/src\/CMakeLists.txt/g' \ - -e 's/cmake\/FindSIMD.cmake/ggml\/cmake\/FindSIMD.cmake/g' \ - -e 's/src\/ggml\.c/ggml/src/ggml.c/g' \ - -e 's/src\/ggml-alloc\.c/ggml/src/ggml-alloc.c/g' \ - -e 's/src\/ggml-backend-impl\.h/ggml/src/ggml-backend-impl.h/g' \ - -e 's/src\/ggml-backend\.c/ggml/src/ggml-backend.c/g' \ - -e 's/src\/ggml-common\.h/ggml/src/ggml-common.h/g' \ - -e 's/src\/ggml-cuda\//ggml-cuda\//g' \ - -e 
's/src\/ggml-cuda\.cu/ggml/src/ggml-cuda.cu/g' \ - -e 's/src\/ggml-impl\.h/ggml/src/ggml-impl.h/g' \ - -e 's/src\/ggml-kompute\.cpp/ggml/src/ggml-kompute.cpp/g' \ - -e 's/src\/ggml-metal\.m/ggml/src/ggml-metal.m/g' \ - -e 's/src\/ggml-quants\.c/ggml/src/ggml-quants.c/g' \ - -e 's/src\/ggml-quants\.h/ggml/src/ggml-quants.h/g' \ - -e 's/src\/ggml-rpc\.cpp/ggml/src/ggml-rpc.cpp/g' \ - -e 's/src\/ggml-sycl\.cpp/ggml/src/ggml-sycl.cpp/g' \ - -e 's/src\/ggml-vulkan\.cpp/ggml/src/ggml-vulkan.cpp/g' \ - -e 's/include\/ggml\.h/ggml/include/ggml.h/g' \ - -e 's/include\/ggml-alloc\.h/ggml/include/ggml-alloc.h/g' \ - -e 's/include\/ggml-backend\.h/ggml/include/ggml-backend.h/g' \ - -e 's/include\/ggml-blas\.h/ggml/include/ggml-blas.h/g' \ - -e 's/include\/ggml-cuda\.h/ggml/include/ggml-cuda.h/g' \ - -e 's/include\/ggml-kompute\.h/ggml/include/ggml-kompute.h/g' \ - -e 's/include\/ggml-metal\.h/ggml/include/ggml-metal.h/g' \ - -e 's/include\/ggml-rpc\.h/ggml/include/ggml-rpc.h/g' \ - -e 's/include\/ggml-sycl\.h/ggml/include/ggml-sycl.h/g' \ - -e 's/include\/ggml-vulkan\.h/ggml/include/ggml-vulkan.h/g' \ - -e 's/tests\/test-opt\.cpp/tests\/test-opt.cpp/g' \ - -e 's/tests\/test-grad0\.cpp/tests\/test-grad0.cpp/g' \ - -e 's/tests\/test-quantize-fns\.cpp/tests\/test-quantize-fns.cpp/g' \ - -e 's/tests\/test-quantize-perf\.cpp/tests\/test-quantize-perf.cpp/g' \ - -e 's/tests\/test-backend-ops\.cpp/tests\/test-backend-ops.cpp/g' \ - -e 's/LICENSE/LICENSE/g' \ - -e 's/scripts\/gen-authors\.sh/scripts\/gen-authors.sh/g' \ + cat ggml-src.patch | sed -E \ + -e 's/([[:space:]]|[ab]\/)CMakeLists.txt/\1ggml\/CMakeLists.txt/g' \ + -e 's/([[:space:]]|[ab]\/)src\/CMakeLists.txt/\1ggml\/src\/CMakeLists.txt/g' \ + -e 's/([[:space:]]|[ab]\/)cmake\/FindSIMD.cmake/\1ggml\/cmake\/FindSIMD.cmake/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml\.c/\1ggml\/src\/ggml.c/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-alloc\.c/\1ggml\/src\/ggml-alloc.c/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-backend-impl\.h/\1ggml\/src\/ggml-backend-impl.h/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-backend\.c/\1ggml\/src\/ggml-backend.c/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-common\.h/\1ggml\/src\/ggml-common.h/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\//\1ggml\/src\/ggml-cuda\//g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\.cu/\1ggml\/src\/ggml-cuda.cu/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-impl\.h/\1ggml\/src\/ggml-impl.h/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-kompute\.cpp/\1ggml\/src\/ggml-kompute.cpp/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-metal\.m/\1ggml\/src\/ggml-metal.m/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.c/\1ggml\/src\/ggml-quants.c/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-quants\.h/\1ggml\/src\/ggml-quants.h/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-rpc\.cpp/\1ggml\/src\/ggml-rpc.cpp/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\.cpp/\1ggml\/src\/ggml-sycl.cpp/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-vulkan\.cpp/\1ggml\/src\/ggml-vulkan.cpp/g' \ + -e 's/([[:space:]]|[ab]\/)include\/ggml\.h/\1ggml\/include\/ggml.h/g' \ + -e 's/([[:space:]]|[ab]\/)include\/ggml-alloc\.h/\1ggml\/include\/ggml-alloc.h/g' \ + -e 's/([[:space:]]|[ab]\/)include\/ggml-backend\.h/\1ggml\/include\/ggml-backend.h/g' \ + -e 's/([[:space:]]|[ab]\/)include\/ggml-blas\.h/\1ggml\/include\/ggml-blas.h/g' \ + -e 's/([[:space:]]|[ab]\/)include\/ggml-cuda\.h/\1ggml\/include\/ggml-cuda.h/g' \ + -e 's/([[:space:]]|[ab]\/)include\/ggml-kompute\.h/\1ggml\/include\/ggml-kompute.h/g' \ + -e 
's/([[:space:]]|[ab]\/)include\/ggml-metal\.h/\1ggml\/include\/ggml-metal.h/g' \ + -e 's/([[:space:]]|[ab]\/)include\/ggml-rpc\.h/\1ggml\/include\/ggml-rpc.h/g' \ + -e 's/([[:space:]]|[ab]\/)include\/ggml-sycl\.h/\1ggml\/include\/ggml-sycl.h/g' \ + -e 's/([[:space:]]|[ab]\/)include\/ggml-vulkan\.h/\1ggml\/include\/ggml-vulkan.h/g' \ + -e 's/([[:space:]]|[ab]\/)examples\/common\.h/examples\/common.h/g' \ + -e 's/([[:space:]]|[ab]\/)examples\/common\.cpp/examples\/common.cpp/g' \ + -e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.h/examples\/common-ggml.h/g' \ + -e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.cpp/examples\/common-ggml.cpp/g' \ + -e 's/([[:space:]]|[ab]\/)LICENSE/LICENSE/g' \ + -e 's/([[:space:]]|[ab]\/)scripts\/gen-authors\.sh/scripts\/gen-authors.sh/g' \ > ggml-src.patch.tmp mv ggml-src.patch.tmp ggml-src.patch From 9b31a40c6ddabe552875b811d7127aa039ca9703 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Thu, 27 Jun 2024 01:50:09 +0200 Subject: [PATCH 09/29] clip : suppress unused variable warnings (#8105) * clip : suppress unused variable warnings This commit suppresses unused variable warnings for the variables e in the catch blocks. The motivation for this change is to suppress the warnings that are generated on Windows when using the MSVC compiler. The warnings are not displayed when using GCC because GCC will mark all catch parameters as used. Signed-off-by: Daniel Bevenius * squash! clip : suppress unused variable warnings Remove e (/*e*/) instead instead of using GGML_UNUSED. --------- Signed-off-by: Daniel Bevenius --- examples/llava/clip.cpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 95fbe3d02..d6882eec3 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1121,20 +1121,20 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } if (n < 32) hparams.image_grid_pinpoints[n] = 0; - } catch (std::runtime_error & e) { + } catch (std::runtime_error & /*e*/) { hparams.image_grid_pinpoints[0]=0; } try { int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE); strcpy(hparams.mm_patch_merge_type, gguf_get_val_str(ctx, idx)); - } catch (std::runtime_error & e) { + } catch (std::runtime_error & /*e*/) { strcpy(hparams.mm_patch_merge_type, "flat"); } try { hparams.image_crop_resolution = get_u32(ctx, KEY_IMAGE_CROP_RESOLUTION); // llava-1.6 - } catch(const std::exception& e) { + } catch(const std::exception& /*e*/) { hparams.image_crop_resolution = hparams.image_size; } @@ -1173,7 +1173,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { try { vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD); new_clip->has_class_embedding = true; - } catch (const std::exception& e) { + } catch (const std::exception& /*e*/) { new_clip->has_class_embedding = false; } @@ -1181,7 +1181,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight")); vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias")); new_clip->has_pre_norm = true; - } catch (std::exception & e) { + } catch (std::exception & /*e*/) { new_clip->has_pre_norm = false; } @@ -1189,21 +1189,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight")); 
vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias")); new_clip->has_post_norm = true; - } catch (std::exception & e) { + } catch (std::exception & /*e*/) { new_clip->has_post_norm = false; } try { vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS); new_clip->has_patch_bias = true; - } catch (std::exception & e) { + } catch (std::exception & /*e*/) { new_clip->has_patch_bias = false; } try { vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); - } catch(const std::exception& e) { + } catch(const std::exception& /*e*/) { LOG_TEE("%s: failed to load vision model tensors\n", __func__); } @@ -1215,26 +1215,26 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { // Yi-type llava vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight")); vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias")); - } catch (std::runtime_error & e) { } + } catch (std::runtime_error & /*e*/) { } try { // missing in Yi-type llava vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight")); vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias")); - } catch (std::runtime_error & e) { } + } catch (std::runtime_error & /*e*/) { } try { // Yi-type llava vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight")); vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias")); - } catch (std::runtime_error & e) { } + } catch (std::runtime_error & /*e*/) { } try { // Yi-type llava vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight")); vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias")); - } catch (std::runtime_error & e) { } + } catch (std::runtime_error & /*e*/) { } try { vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE); // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__); - } catch (std::runtime_error & e) { } + } catch (std::runtime_error & /*e*/) { } } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { // MobileVLM projection vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight")); From ac146628e47451c531a3c7e62e6a973a2bb467a0 Mon Sep 17 00:00:00 2001 From: Raj Hammeer Singh Hada Date: Thu, 27 Jun 2024 07:27:57 +0530 Subject: [PATCH 10/29] Fix llama-android.cpp for error - "common/common.h not found" (#8145) - Path seems to be wrong for the common.h header file in llama-android.cpp file. Fixing the path so the Android Build doesn't fail with the error "There is no file common/common.h" --- examples/llama.android/llama/src/main/cpp/llama-android.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp index 874158ef0..92a6b16b1 100644 --- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp @@ -5,7 +5,7 @@ #include #include #include "llama.h" -#include "common/common.h" +#include "common.h" // Write C++ code here. 
// From 911e35bb8bb2fd1c7d3f40f27e96ff432eae7e14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Thu, 27 Jun 2024 09:46:41 +0200 Subject: [PATCH 11/29] llama : fix CodeLlama FIM token checks (#8144) * account for space prefix character * use find instead --- src/llama.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index f78594a6f..080057332 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -5152,10 +5152,10 @@ static void llm_load_vocab( if (gen_name.find("code") != std::string::npos) { if (model.arch == LLM_ARCH_LLAMA && 32010 < vocab.id_to_token.size() - && vocab.id_to_token[32007].text == "
"
-              && vocab.id_to_token[32008].text == "<SUF>"
-              && vocab.id_to_token[32009].text == "<MID>"
-              && vocab.id_to_token[32010].text == "<EOT>") {
+              && vocab.id_to_token[32007].text.find("<PRE>") != std::string::npos
+              && vocab.id_to_token[32008].text.find("<SUF>") != std::string::npos
+              && vocab.id_to_token[32009].text.find("<MID>") != std::string::npos
+              && vocab.id_to_token[32010].text.find("<EOT>") != std::string::npos) {
                 vocab.special_prefix_id = 32007;
                 vocab.special_suffix_id = 32008;
                 vocab.special_middle_id = 32009;

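The hunk above replaces exact comparison of the CodeLlama fill-in-the-middle token texts with substring checks, so that conversions whose token text carries a leading space-prefix character are still detected. Below is a minimal standalone sketch of that idea; it is not part of the patch, and the sample strings are only illustrative.

// Sketch: why find() tolerates a tokenizer-dependent prefix while == does not.
#include <cassert>
#include <string>

static bool is_fim_prefix_token(const std::string & text) {
    // substring check matches "<PRE>" with or without a leading "\u2581"
    return text.find("<PRE>") != std::string::npos;
}

int main() {
    assert( is_fim_prefix_token("<PRE>"));        // token stored without a prefix
    assert( is_fim_prefix_token("\u2581<PRE>"));  // token stored with the space-prefix character
    assert(!is_fim_prefix_token("<SUF>"));        // unrelated token is still rejected
    return 0;
}
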
From f675b20a3b7f878bf3be766b9a737e2c8321ff0d Mon Sep 17 00:00:00 2001
From: kustaaya <58045274+kustaaya@users.noreply.github.com>
Date: Thu, 27 Jun 2024 11:58:54 +0300
Subject: [PATCH 12/29] Added support for Viking pre-tokenizer (#8135)

Co-authored-by: kustaaya 
---
 convert-hf-to-gguf-update.py | 1 +
 convert-hf-to-gguf.py        | 3 +++
 include/llama.h              | 1 +
 src/llama.cpp                | 9 +++++++++
 4 files changed, 14 insertions(+)

diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py
index 67598b561..2758214fa 100755
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -85,6 +85,7 @@ models = [
     {"name": "smaug-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
     {"name": "poro-chat",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
     {"name": "jina-v2-code",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
+    {"name": "viking",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
 ]
 
 
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index c26fad930..5bf69ef9f 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -487,6 +487,9 @@ class Model:
         if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
             res = "jina-v2-code"
+        if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
+            # ref: https://huggingface.co/LumiOpen/Viking-7B
+            res = "viking"
 
         if res is None:
             logger.warning("\n")
diff --git a/include/llama.h b/include/llama.h
index 88eecb0ed..cafeafb85 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -88,6 +88,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_DBRX           = 13,
         LLAMA_VOCAB_PRE_TYPE_SMAUG          = 14,
         LLAMA_VOCAB_PRE_TYPE_PORO           = 15,
+        LLAMA_VOCAB_PRE_TYPE_VIKING         = 16,
     };
 
     // note: these values should be synchronized with ggml_rope
diff --git a/src/llama.cpp b/src/llama.cpp
index 080057332..b97b5e279 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5067,6 +5067,9 @@ static void llm_load_vocab(
             } else if (
                 tokenizer_pre == "poro-chat") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
+            } else if (
+                tokenizer_pre == "viking") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -13703,6 +13706,12 @@ struct llm_tokenizer_bpe {
                     " ?[^(\\s|.,!?…。,、।۔،)]+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_VIKING:
+                regex_exprs = {
+                    "\\p{N}",
+                    " ?[^(\\s|.,!?…。,、।۔،)]+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {

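The Viking pre-tokenizer added above splits numerals one code point at a time ("\p{N}") and gathers everything else into optionally space-prefixed chunks. The sketch below mimics that splitting with plain std::regex, restricted to ASCII because std::regex has no Unicode property classes; it only illustrates the splitting behaviour and is not the regex engine llama.cpp uses.

// ASCII-only analogue of the Viking pre-tokenizer split (illustration only).
#include <iostream>
#include <regex>
#include <string>
#include <vector>

static std::vector<std::string> pre_tokenize(const std::string & text) {
    // one digit at a time, or an optional space followed by a run of non-space, non-digit characters
    static const std::regex re(R"([0-9]| ?[^\s0-9]+)");
    std::vector<std::string> pieces;
    for (auto it = std::sregex_iterator(text.begin(), text.end(), re); it != std::sregex_iterator(); ++it) {
        pieces.push_back(it->str());
    }
    return pieces;
}

int main() {
    // prints: [Viking] [7] [B] [ has] [7] [0] [0] [0] [ million] [ params]
    for (const auto & piece : pre_tokenize("Viking 7B has 7000 million params")) {
        std::cout << "[" << piece << "] ";
    }
    std::cout << "\n";
    return 0;
}
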
From 85a267daaa1c6f8fd69160445bcb88717031d10c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= 
Date: Thu, 27 Jun 2024 16:26:05 +0200
Subject: [PATCH 13/29] CUDA: fix MMQ stream-k for --split-mode row (#8167)

---
 ggml/src/ggml-cuda/mmq.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index 31fcbf139..1396e7a75 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -2475,7 +2475,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
 
     const dim3 block_nums_mmq(nsm, 1, 1);
 
-    ggml_cuda_pool & pool = ctx.pool();
+    ggml_cuda_pool & pool = ctx.pool(id);
     ggml_cuda_pool_alloc<float> tmp_fixup(pool, block_nums_mmq.x * mmq_x*mmq_y);
 
     if (args.ne01 % mmq_y == 0) {

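The one-line change above takes the stream-k fixup buffer from the pool selected by `id` instead of the context's default pool. With --split-mode row each GPU executes its own slice of the MMQ kernel, so scratch memory has to live on the device that actually runs the kernel, which is presumably what `id` identifies here. The following host-only sketch of a per-device pool lookup is a simplified stand-in, not the real ggml-cuda types.

// Simplified stand-in for per-device pool selection (not the real ggml-cuda API).
#include <cassert>
#include <cstddef>
#include <map>

struct device_pool {
    int device_id = 0;
    void * alloc(std::size_t /*nbytes*/) { return nullptr; } // device-local allocation would go here
};

struct cuda_context_sketch {
    int main_device = 0;
    std::map<int, device_pool> pools;

    device_pool & pool()       { return pools[main_device]; } // old behaviour: always the default device
    device_pool & pool(int id) { return pools[id]; }          // fixed behaviour: the requested device
};

int main() {
    cuda_context_sketch ctx;
    ctx.pools[0].device_id = 0;
    ctx.pools[1].device_id = 1;

    const int id = 1; // device whose stream launches the MMQ kernel under --split-mode row
    assert(ctx.pool(id).device_id == id); // scratch buffer comes from the correct device
    assert(ctx.pool().device_id != id);   // the default pool would have been the wrong one
    return 0;
}
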
From 6030c61281c8a7eb94eceb7396a608fac8b71555 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= 
Date: Thu, 27 Jun 2024 16:27:41 +0200
Subject: [PATCH 14/29] Add Qwen2MoE 57B-A14B model identifier (#8158)

* Add Qwen2MoE 57B-A14B

* Add Qwen2MoE 57B-A14B
---
 src/llama.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/llama.cpp b/src/llama.cpp
index b97b5e279..3dc0f8535 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2038,6 +2038,7 @@ enum e_model {
     MODEL_8x22B,
     MODEL_16x12B,
     MODEL_10B_128x3_66B,
+    MODEL_57B_A14B,
 };
 
 static const size_t kiB = 1024;
@@ -4267,6 +4268,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_8x22B:         return "8x22B";
         case MODEL_16x12B:        return "16x12B";
         case MODEL_10B_128x3_66B: return "10B+128x3.66B";
+        case MODEL_57B_A14B:      return "57B.A14B";
         default:                  return "?B";
     }
 }
@@ -4588,6 +4590,7 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_A2_7B; break;
+                    case 28: model.type = e_model::MODEL_57B_A14B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;

From 387952651a8fc493f8c85ea4c9774bd4a5694f87 Mon Sep 17 00:00:00 2001
From: Raj Hammeer Singh Hada 
Date: Thu, 27 Jun 2024 20:09:29 +0530
Subject: [PATCH 15/29] Delete examples/llama.android/llama/CMakeLists.txt
 (#8165)

* Delete examples/llama.android/llama/CMakeLists.txt

https://github.com/ggerganov/llama.cpp/pull/8145#issuecomment-2194534244

This file is not being used for building on Android. `llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt` is being used instead.

* Update CMakeLists.txt

Pick local llama.cpp files instead of fetching content from git
---
 examples/llama.android/llama/CMakeLists.txt   | 55 -------------------
 .../llama/src/main/cpp/CMakeLists.txt         | 18 +++---
 2 files changed, 11 insertions(+), 62 deletions(-)
 delete mode 100644 examples/llama.android/llama/CMakeLists.txt

diff --git a/examples/llama.android/llama/CMakeLists.txt b/examples/llama.android/llama/CMakeLists.txt
deleted file mode 100644
index a5618cac0..000000000
--- a/examples/llama.android/llama/CMakeLists.txt
+++ /dev/null
@@ -1,55 +0,0 @@
-
-# For more information about using CMake with Android Studio, read the
-# documentation: https://d.android.com/studio/projects/add-native-code.html.
-# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
-
-# Sets the minimum CMake version required for this project.
-cmake_minimum_required(VERSION 3.22.1)
-
-# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
-# Since this is the top level CMakeLists.txt, the project name is also accessible
-# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
-# build script scope).
-project("llama-android")
-
-## Fetch latest llama.cpp from GitHub
-#include(FetchContent)
-#FetchContent_Declare(
-#        llama
-#        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
-#        GIT_TAG        master
-#)
-#
-## Also provides "common"
-#FetchContent_MakeAvailable(llama)
-
-# llama.cpp CI uses the code from the current branch
-# ref: https://github.com/ggerganov/llama.cpp/pull/7341#issuecomment-2117617700
-add_subdirectory(../../../../../../ build-llama)
-
-# Creates and names a library, sets it as either STATIC
-# or SHARED, and provides the relative paths to its source code.
-# You can define multiple libraries, and CMake builds them for you.
-# Gradle automatically packages shared libraries with your APK.
-#
-# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
-# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
-# is preferred for the same purpose.
-#
-# In order to load a library into your app from Java/Kotlin, you must call
-# System.loadLibrary() and pass the name of the library defined here;
-# for GameActivity/NativeActivity derived applications, the same library name must be
-# used in the AndroidManifest.xml file.
-add_library(${CMAKE_PROJECT_NAME} SHARED
-    # List C/C++ source files with relative paths to this CMakeLists.txt.
-        llama-android.cpp)
-
-# Specifies libraries CMake should link to your target library. You
-# can link libraries from various origins, such as libraries defined in this
-# build script, prebuilt third-party libraries, or Android system libraries.
-target_link_libraries(${CMAKE_PROJECT_NAME}
-    # List libraries link to the target library
-    llama
-    common
-    android
-    log)
diff --git a/examples/llama.android/llama/src/main/cpp/CMakeLists.txt b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
index 42ebaad49..2de496574 100644
--- a/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
+++ b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
@@ -11,15 +11,15 @@ cmake_minimum_required(VERSION 3.22.1)
 # build script scope).
 project("llama-android")
 
-include(FetchContent)
-FetchContent_Declare(
-        llama
-        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
-        GIT_TAG        master
-)
+#include(FetchContent)
+#FetchContent_Declare(
+#        llama
+#        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+#        GIT_TAG        master
+#)
 
 # Also provides "common"
-FetchContent_MakeAvailable(llama)
+#FetchContent_MakeAvailable(llama)
 
 # Creates and names a library, sets it as either STATIC
 # or SHARED, and provides the relative paths to its source code.
@@ -30,6 +30,10 @@ FetchContent_MakeAvailable(llama)
 # the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
 # is preferred for the same purpose.
 #
+
+#load local llama.cpp
+add_subdirectory(../../../../../../ build-llama)
+
 # In order to load a library into your app from Java/Kotlin, you must call
 # System.loadLibrary() and pass the name of the library defined here;
 # for GameActivity/NativeActivity derived applications, the same library name must be

From 97877eb10bd8e7f8023420b5b5300bcbdadd62dc Mon Sep 17 00:00:00 2001
From: jukofyork <69222624+jukofyork@users.noreply.github.com>
Date: Thu, 27 Jun 2024 15:48:07 +0100
Subject: [PATCH 16/29] Control vector loading fixes (#8137)

* Fixed leak in llama_control_vector_load_one() and allow llama_control_vector_load() to grow

* refactored `llama_control_vector_load_one()`

* allow multiple directions for same layer in same file

* llama_control_vector_load_one() and llama_control_vector_load() now break on error

* removed unnecessary ggml_free() call
---
 common/common.cpp | 186 +++++++++++++++++++---------------------------
 1 file changed, 76 insertions(+), 110 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index c76d0e2c3..70349ad70 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2804,125 +2804,87 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
 //
 
 static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
-    int32_t n_tensors;
-
-    size_t n_bytes = 0;
-
-    uint32_t max_direction_layer = 0;
-
     llama_control_vector_data result = { -1, {} };
 
-    // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
-    {
-        struct ggml_init_params meta_params = {
-            /* .mem_size   = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(),
-            /* .mem_buffer = */ nullptr,
-            /* .no_alloc   = */ true,
-        };
-        ggml_context * meta_ctx = ggml_init(meta_params);
-        struct gguf_init_params meta_gguf_params = {
-            /* .no_alloc = */ true,
-            /* .ctx      = */ &meta_ctx,
-        };
-        struct gguf_context * meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
-        if (!meta_ctx_gguf) {
-            fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
-            ggml_free(meta_ctx);
-            return result;
-        }
-
-        n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
-        for (int i = 0; i < n_tensors; i++) {
-            std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);
-
-            // split on '.'
-            size_t dotpos = name.find('.');
-            if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
-                try {
-                    uint32_t layer = std::stoi(name.substr(dotpos + 1));
-                    if (layer == 0) {
-                        fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
-                        ggml_free(meta_ctx);
-                        gguf_free(meta_ctx_gguf);
-                        return result;
-                    }
-                    if (layer > max_direction_layer) {
-                        max_direction_layer = layer;
-                    }
-                } catch (...) {
-                    fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
-                    ggml_free(meta_ctx);
-                    gguf_free(meta_ctx_gguf);
-                    return result;
-                }
-            }
-
-            struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
-            if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
-                fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
-                ggml_free(meta_ctx);
-                gguf_free(meta_ctx_gguf);
-                return result;
-            }
-            if (result.n_embd == -1) {
-                result.n_embd = ggml_nelements(tensor_meta);
-            } else if (ggml_nelements(tensor_meta) != result.n_embd) {
-                fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str());
-                ggml_free(meta_ctx);
-                gguf_free(meta_ctx_gguf);
-                return result;
-            }
-            n_bytes += ggml_nbytes(tensor_meta);
-        }
-        ggml_free(meta_ctx);
-        gguf_free(meta_ctx_gguf);
+    ggml_context * ctx = nullptr;
+    struct gguf_init_params meta_gguf_params = {
+        /* .no_alloc = */ false,
+        /* .ctx      = */ &ctx,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
+    if (!ctx_gguf) {
+        fprintf(stderr, "%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
+        return result;
     }
 
+    int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
     if (n_tensors == 0) {
         fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
-        return result;
     }
 
-    // load and scale tensors into final control vector context
-    struct ggml_init_params ggml_params = {
-        /* .mem_size   = */ ggml_tensor_overhead() * n_tensors + n_bytes,
-        /* .mem_buffer = */ nullptr,
-        /* .no_alloc   = */ false,
-    };
-    struct ggml_context * ctx = ggml_init(ggml_params);
+    for (int i = 0; i < n_tensors; i++) {
+        std::string name = gguf_get_tensor_name(ctx_gguf, i);
 
-    struct gguf_init_params params = {
-        /*.no_alloc = */ false,
-        /*.ctx      = */ &ctx,
-    };
-    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), params);
-    if (!ctx_gguf) {
-        fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
-        ggml_free(ctx);
-        return result;
-    }
+        int layer_idx = -1;
 
-    // do not store data for layer 0 (it's not used)
-    result.data.resize(result.n_embd * max_direction_layer);
-
-    for (uint32_t il = 1; il <= max_direction_layer; il++) {
-        const std::string name = "direction." + std::to_string(il);
-        const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
-
-        float * dst = result.data.data() + result.n_embd * (il - 1);
-
-        if (tensor) {
-            const float * src = (const float *) tensor->data;
-            for (int j = 0; j < result.n_embd; j++) {
-                dst[j] = src[j] * load_info.strength;
-            }
-        } else {
-            for (int j = 0; j < result.n_embd; j++) {
-                dst[j] = 0.0f;
+        // split on '.'
+        size_t dotpos = name.find('.');
+        if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
+            try {
+                layer_idx = std::stoi(name.substr(dotpos + 1));
+            } catch (...) {
+                layer_idx = -1;
             }
         }
+        if (layer_idx < 0) {
+            fprintf(stderr, "%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        } else if (layer_idx == 0) {
+            fprintf(stderr, "%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+
+        struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
+        if (tensor->type != GGML_TYPE_F32) {
+            fprintf(stderr, "%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+        if (ggml_n_dims(tensor) != 1) {
+            fprintf(stderr, "%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+
+        if (result.n_embd == -1) {
+            result.n_embd = ggml_nelements(tensor);
+        } else if (ggml_nelements(tensor) != result.n_embd) {
+            fprintf(stderr, "%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+
+        // extend if necessary - do not store data for layer 0 (it's not used)
+        result.data.resize(std::max(result.data.size(), static_cast<size_t>(result.n_embd * layer_idx)), 0.0f);
+
+        const float * src = (const float *) tensor->data;
+        float * dst = result.data.data() + result.n_embd * (layer_idx - 1);  // layer 1 at [0]
+        for (int j = 0; j < result.n_embd; j++) {
+            dst[j] += src[j] * load_info.strength;  // allows multiple directions for same layer in same file
+        }
+
     }
 
+    if (result.n_embd == -1) {
+        fprintf(stderr, "%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
+        result.data.clear();
+    }
+
+    gguf_free(ctx_gguf);
+    ggml_free(ctx);
+
     return result;
 }
 
@@ -2933,16 +2895,19 @@ llama_control_vector_data llama_control_vector_load(const std::vector
Date: Thu, 27 Jun 2024 18:37:29 +0300
Subject: [PATCH 17/29] flake.lock: Update (#8071)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Flake lock file updates:

• Updated input 'nixpkgs':
    'github:NixOS/nixpkgs/e9ee548d90ff586a6471b4ae80ae9cfcbceb3420?narHash=sha256-4Zu0RYRcAY/VWuu6awwq4opuiD//ahpc2aFHg2CWqFY%3D' (2024-06-13)
  → 'github:NixOS/nixpkgs/d603719ec6e294f034936c0d0dc06f689d91b6c3?narHash=sha256-k3JqJrkdoYwE3fHE6xGDY676AYmyh4U2Zw%2B0Bwe5DLU%3D' (2024-06-20)

Co-authored-by: github-actions[bot] 
Co-authored-by: Philip Taron 
---
 flake.lock | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/flake.lock b/flake.lock
index 5278fb68a..79bb3f63f 100644
--- a/flake.lock
+++ b/flake.lock
@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1718318537,
-        "narHash": "sha256-4Zu0RYRcAY/VWuu6awwq4opuiD//ahpc2aFHg2CWqFY=",
+        "lastModified": 1718895438,
+        "narHash": "sha256-k3JqJrkdoYwE3fHE6xGDY676AYmyh4U2Zw+0Bwe5DLU=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "e9ee548d90ff586a6471b4ae80ae9cfcbceb3420",
+        "rev": "d603719ec6e294f034936c0d0dc06f689d91b6c3",
         "type": "github"
       },
       "original": {

From 16791b8f0b4526aafbf5d0e5bbbd2e99c2253418 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen 
Date: Thu, 27 Jun 2024 18:14:19 +0200
Subject: [PATCH 18/29] Add chatml fallback for cpp `llama_chat_apply_template`
 (#8160)

* add chatml fallback for cpp `llama_chat_apply_template`

* remove redundant code
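
For reference, a hedged usage sketch of the C++ wrapper (the real call sites are in the
examples; `format_chat` below is a made-up helper and the model-loading boilerplate is
omitted). With an empty `tmpl` the wrapper uses the model's built-in template and now silently
falls back to chatml if that template is unknown; an explicitly supplied but unsupported
template still throws.

```cpp
#include "common.h"

#include <string>
#include <vector>

// Hypothetical helper for illustration only.
std::string format_chat(const llama_model * model) {
    std::vector<llama_chat_msg> msgs = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!"                       },
    };

    // empty tmpl -> use the model's built-in template, or chatml if it is unsupported
    return llama_chat_apply_template(model, "", msgs, /* add_ass */ true);
}
```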
---
 common/common.cpp | 19 ++++++++++++++++++-
 common/common.h   |  2 ++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/common/common.cpp b/common/common.cpp
index 70349ad70..57d03a578 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2618,6 +2618,7 @@ std::string llama_chat_apply_template(const struct llama_model * model,
         const std::vector<llama_chat_msg> & msgs,
         bool add_ass) {
     int alloc_size = 0;
+    bool fallback = false; // indicate if we must fallback to default chatml
     std::vector<llama_chat_message> chat;
     for (auto & msg : msgs) {
         chat.push_back({msg.role.c_str(), msg.content.c_str()});
@@ -2630,10 +2631,26 @@ std::string llama_chat_apply_template(const struct llama_model * model,
     // run the first time to get the total output length
     int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
 
+    // error: chat template is not supported
+    if (res < 0) {
+        if (ptr_tmpl != nullptr) {
+            // if the custom "tmpl" is not supported, we throw an error
+            // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
+            throw std::runtime_error("this custom template is not supported");
+        } else {
+            // If the built-in template is not supported, we default to chatml
+            res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+            fallback = true;
+        }
+    }
+
     // if it turns out that our buffer is too small, we resize it
     if ((size_t) res > buf.size()) {
         buf.resize(res);
-        res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+        res = llama_chat_apply_template(
+            fallback ? nullptr : model,
+            fallback ? "chatml" : ptr_tmpl,
+            chat.data(), chat.size(), add_ass, buf.data(), buf.size());
     }
 
     std::string formatted_chat(buf.data(), res);
diff --git a/common/common.h b/common/common.h
index c541204f6..0486ba380 100644
--- a/common/common.h
+++ b/common/common.h
@@ -380,6 +380,8 @@ struct llama_chat_msg {
 bool llama_chat_verify_template(const std::string & tmpl);
 
 // CPP wrapper for llama_chat_apply_template
+// If the built-in template is not supported, we default to chatml
+// If the custom "tmpl" is not supported, we throw an error
 std::string llama_chat_apply_template(const struct llama_model * model,
         const std::string & tmpl,
         const std::vector<llama_chat_msg> & chat,

From 8172ee9da9921ca53d698c7438c2d792b3f3f2c8 Mon Sep 17 00:00:00 2001
From: slaren 
Date: Thu, 27 Jun 2024 20:04:39 +0200
Subject: [PATCH 19/29] cmake : fix deprecated option names not working (#8171)

* cmake : fix deprecated option names not working

* remove LLAMA_OPENMP
---
 CMakeLists.txt | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7a7197282..dba083089 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -86,7 +86,7 @@ set(GGML_CUDA_USE_GRAPHS    ON)
 function (llama_option_depr TYPE OLD NEW)
     if (${OLD})
         message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n")
-        set(${NEW} ON)
+        set(${NEW} ON PARENT_SCOPE)
     endif()
 endfunction()
 
@@ -96,7 +96,6 @@ llama_option_depr(WARNING     LLAMA_KOMPUTE             GGML_KOMPUTE)
 llama_option_depr(WARNING     LLAMA_METAL               GGML_METAL)
 llama_option_depr(WARNING     LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
 llama_option_depr(WARNING     LLAMA_NATIVE              GGML_NATIVE)
-llama_option_depr(WARNING     LLAMA_OPENMP              GGML_OPENMP)
 llama_option_depr(WARNING     LLAMA_RPC                 GGML_RPC)
 llama_option_depr(WARNING     LLAMA_SYCL                GGML_SYCL)
 llama_option_depr(WARNING     LLAMA_SYCL_F16            GGML_SYCL_F16)

From 558f44bf83d78242d4e5c4ab98d0be9125cb9780 Mon Sep 17 00:00:00 2001
From: loonerin <132926317+loonerin@users.noreply.github.com>
Date: Thu, 27 Jun 2024 15:01:23 -0400
Subject: [PATCH 20/29] CI: fix release build (Ubuntu+Mac) (#8170)

* CI: fix release build (Ubuntu)

PR #8006 changed the default to building shared libs. However, the release CI
expects static builds.

* CI: fix release build (Mac)

---------

Co-authored-by: loonerin 
---
 .github/workflows/build.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 208515287..adf67cecc 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -47,7 +47,7 @@ jobs:
           sysctl -a
           mkdir build
           cd build
-          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
+          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF ..
           cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
 
       - name: Test
@@ -105,7 +105,7 @@ jobs:
           sysctl -a
           # Metal is disabled due to intermittent failures with Github runners not having a GPU:
           # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON
+          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
           cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
 
       - name: Test
@@ -222,7 +222,7 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
           cmake --build . --config Release -j $(nproc)
 
       - name: Test

From cb0b06a8a613f7a2ccb7253b2a3c00fdd397ba1c Mon Sep 17 00:00:00 2001
From: Olivier Chafik 
Date: Thu, 27 Jun 2024 22:08:42 +0100
Subject: [PATCH 21/29] `json`: update grammars/README w/ examples & note about
 additionalProperties (#8132)

* json: update grammars/README

* mention broken prefixItems

* add mention to llama-gbnf-validator

* json: explicit type: object for nested items object in cli example
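
As a rough companion to the CLI example added to the README, here is a hedged C++ sketch of
driving the same conversion programmatically. It assumes the converter entry point
`json_schema_to_grammar()` declared in `common/json-schema-to-grammar.h` and the bundled
nlohmann `json.hpp`; treat it as an illustration, not part of this patch.

```cpp
#include "json-schema-to-grammar.h"
#include "json.hpp" // bundled nlohmann::json

#include <iostream>

int main() {
    // same shape as the {name, age} object used in the README example
    nlohmann::ordered_json schema = nlohmann::ordered_json::parse(R"({
        "type": "object",
        "properties": {
            "name": { "type": "string", "minLength": 1, "maxLength": 100 },
            "age":  { "type": "integer", "minimum": 0, "maximum": 150 }
        },
        "required": ["name", "age"],
        "additionalProperties": false
    })");

    // print the generated GBNF; output should match the Python converter's
    std::cout << json_schema_to_grammar(schema) << std::endl;
    return 0;
}
```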
---
 grammars/README.md | 245 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 235 insertions(+), 10 deletions(-)

diff --git a/grammars/README.md b/grammars/README.md
index 2f685eb6d..40f666240 100644
--- a/grammars/README.md
+++ b/grammars/README.md
@@ -126,19 +126,244 @@ You can use GBNF grammars:
     - in CLI, with [examples/json_schema_to_grammar.py](../examples/json_schema_to_grammar.py)
     - in JavaScript with [json-schema-to-grammar.mjs](../examples/server/public/json-schema-to-grammar.mjs) (this is used by the [server](../examples/server)'s Web UI)
 
-Take a look at [tests](../../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555).
+Take a look at [tests](../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555).
 
-Here is also a non-exhaustive list of **unsupported** features:
+```bash
+llama-cli \
+  -hfr bartowski/Phi-3-medium-128k-instruct-GGUF \
+  -hff Phi-3-medium-128k-instruct-Q8_0.gguf \
+  -j '{
+    "type": "array",
+    "items": {
+        "type": "object",
+        "properties": {
+            "name": {
+                "type": "string",
+                "minLength": 1,
+                "maxLength": 100
+            },
+            "age": {
+                "type": "integer",
+                "minimum": 0,
+                "maximum": 150
+            }
+        },
+        "required": ["name", "age"],
+        "additionalProperties": false
+    },
+    "minItems": 10,
+    "maxItems": 100
+  }' \
+  -p 'Generate a {name, age}[] JSON array with famous actors of all ages.'
+```
 
-- `additionalProperties`: to be fixed in https://github.com/ggerganov/llama.cpp/pull/7840
-- `minimum`, `exclusiveMinimum`, `maximum`, `exclusiveMaximum`
-    - `integer` constraints to be implemented in https://github.com/ggerganov/llama.cpp/pull/7797
-- Remote `$ref`s in the C++ version (Python & JavaScript versions fetch https refs)
-- Mixing `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703)
-- `string` formats `uri`, `email`
+
+<details>
+<summary>Show grammar</summary>
+
+You can convert any schema in command-line with:
+
+```bash
+examples/json_schema_to_grammar.py name-age-schema.json
+```
+
+```
+char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+item ::= "{" space item-name-kv "," space item-age-kv "}" space
+item-age ::= ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-4] [0-9] | [5] "0")) space
+item-age-kv ::= "\"age\"" space ":" space item-age
+item-name ::= "\"" char{1,100} "\"" space
+item-name-kv ::= "\"name\"" space ":" space item-name
+root ::= "[" space item ("," space item){9,99} "]" space
+space ::= | " " | "\n" [ \t]{0,20}
+```
+
+</details>
+
+Here is also a list of known limitations (contributions welcome):
+
+- Unsupported features are skipped silently. It is currently advised to use the command-line Python converter (see above) to see any warnings, and to inspect the resulting grammar / test it w/ [llama-gbnf-validator](../examples/gbnf-validator/gbnf-validator.cpp).
+- Can't mix `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703)
+- [prefixItems](https://json-schema.org/draft/2020-12/json-schema-core#name-prefixitems) is broken (but [items](https://json-schema.org/draft/2020-12/json-schema-core#name-items) works)
+- `minimum`, `exclusiveMinimum`, `maximum`, `exclusiveMaximum`: only supported for `"type": "integer"` for now, not `number`
+- Nested `$ref`s are broken (https://github.com/ggerganov/llama.cpp/issues/8073)
+- [pattern](https://json-schema.org/draft/2020-12/json-schema-validation#name-pattern)s must start with `^` and end with `$`
+- Remote `$ref`s not supported in the C++ version (Python & JavaScript versions fetch https refs)
+- `string` [formats](https://json-schema.org/draft/2020-12/json-schema-validation#name-defined-formats) lack `uri`, `email`
+- No [`patternProperties`](https://json-schema.org/draft/2020-12/json-schema-core#name-patternproperties)
+
+And a non-exhaustive list of other unsupported features that are unlikely to be implemented (hard and/or too slow to support w/ stateless grammars):
+
+- [`uniqueItems`](https://json-schema.org/draft/2020-12/json-schema-validation#name-uniqueitems)
 - [`contains`](https://json-schema.org/draft/2020-12/json-schema-core#name-contains) / `minContains`
-- `uniqueItems`
 - `$anchor` (cf. [dereferencing](https://json-schema.org/draft/2020-12/json-schema-core#name-dereferencing))
 - [`not`](https://json-schema.org/draft/2020-12/json-schema-core#name-not)
 - [Conditionals](https://json-schema.org/draft/2020-12/json-schema-core#name-keywords-for-applying-subsche) `if` / `then` / `else` / `dependentSchemas`
-- [`patternProperties`](https://json-schema.org/draft/2020-12/json-schema-core#name-patternproperties)
+
+### A word about additionalProperties
+
+> [!WARNING]
+> By default, `object`s accept [additional properties](https://json-schema.org/understanding-json-schema/reference/object#additionalproperties), which you might not want / not expect, and which will make sampling slower (not just because of the extra tokens, but also generates a slower grammar).
+> You can set `"additionalProperties": false` on the schema of any object to ensure only properties listed in `properties` are generated (not needed for non-`object` types, e.g. `array` or `string`).
+
+If you're using [Pydantic](https://pydantic.dev/) to generate schemas, you can disable additional properties with the `extra` config on each model class:
+
+```python
+# pip install pydantic
+import json
+from typing import Annotated, List
+from pydantic import BaseModel, Extra, Field
+class QAPair(BaseModel):
+    class Config:
+        extra = 'forbid' # triggers additionalProperties: false in the JSON schema
+    question: str
+    concise_answer: str
+    justification: str
+
+class Summary(BaseModel):
+    class Config:
+        extra = 'forbid'
+    key_facts: List[Annotated[str, Field(pattern='- .{5,}')]]
+    question_answers: List[Annotated[List[QAPair], Field(min_items=5)]]
+
+print(json.dumps(Summary.model_json_schema(), indent=2))
+```
+
+<details>
+<summary>Show JSON schema & grammar</summary>
+
+```json
+{
+  "$defs": {
+    "QAPair": {
+      "additionalProperties": false,
+      "properties": {
+        "question": {
+          "title": "Question",
+          "type": "string"
+        },
+        "concise_answer": {
+          "title": "Concise Answer",
+          "type": "string"
+        },
+        "justification": {
+          "title": "Justification",
+          "type": "string"
+        }
+      },
+      "required": [
+        "question",
+        "concise_answer",
+        "justification"
+      ],
+      "title": "QAPair",
+      "type": "object"
+    }
+  },
+  "additionalProperties": false,
+  "properties": {
+    "key_facts": {
+      "items": {
+        "pattern": "^- .{5,}$",
+        "type": "string"
+      },
+      "title": "Key Facts",
+      "type": "array"
+    },
+    "question_answers": {
+      "items": {
+        "items": {
+          "$ref": "#/$defs/QAPair"
+        },
+        "minItems": 5,
+        "type": "array"
+      },
+      "title": "Question Answers",
+      "type": "array"
+    }
+  },
+  "required": [
+    "key_facts",
+    "question_answers"
+  ],
+  "title": "Summary",
+  "type": "object"
+}
+```
+
+```
+QAPair ::= "{" space QAPair-question-kv "," space QAPair-concise-answer-kv "," space QAPair-justification-kv "}" space
+QAPair-concise-answer-kv ::= "\"concise_answer\"" space ":" space string
+QAPair-justification-kv ::= "\"justification\"" space ":" space string
+QAPair-question-kv ::= "\"question\"" space ":" space string
+char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+dot ::= [^\x0A\x0D]
+key-facts ::= "[" space (key-facts-item ("," space key-facts-item)*)? "]" space
+key-facts-item ::= "\"" "- " key-facts-item-1{5,} "\"" space
+key-facts-item-1 ::= dot
+key-facts-kv ::= "\"key_facts\"" space ":" space key-facts
+question-answers ::= "[" space (question-answers-item ("," space question-answers-item)*)? "]" space
+question-answers-item ::= "[" space question-answers-item-item ("," space question-answers-item-item){4,} "]" space
+question-answers-item-item ::= QAPair
+question-answers-kv ::= "\"question_answers\"" space ":" space question-answers
+root ::= "{" space key-facts-kv "," space question-answers-kv "}" space
+space ::= | " " | "\n" [ \t]{0,20}
+string ::= "\"" char* "\"" space
+```
+
+</details>
+
+If you're using [Zod](https://zod.dev/), you can make your objects explicitly strict w/ `z.object(...).strict()` or `z.strictObject(...)`.
+
+Note however that [zod-to-json-schema](https://github.com/StefanTerdell/zod-to-json-schema) currently always seems to set `"additionalProperties": false` anyway (even w/ zod schemas on which `nonstrict()` / `passthrough()` was called).
+
+```js
+import { z } from 'zod';
+import { zodToJsonSchema } from 'zod-to-json-schema';
+
+const Foo = z.object({
+  age: z.number().positive(),
+  email: z.string().email(),
+}).strict();
+
+console.log(zodToJsonSchema(Foo));
+```
+
+<details>
+<summary>Show JSON schema & grammar</summary>
+
+```json
+{
+  "type": "object",
+  "properties": {
+    "age": {
+      "type": "number",
+      "exclusiveMinimum": 0
+    },
+    "email": {
+      "type": "string",
+      "format": "email"
+    }
+  },
+  "required": [
+    "age",
+    "email"
+  ],
+  "additionalProperties": false,
+  "$schema": "http://json-schema.org/draft-07/schema#"
+}
+```
+
+```
+age-kv ::= "\"age\"" space ":" space number
+char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+decimal-part ::= [0-9]{1,16}
+email-kv ::= "\"email\"" space ":" space string
+integral-part ::= [0] | [1-9] [0-9]{0,15}
+number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
+root ::= "{" space age-kv "," space email-kv "}" space
+space ::= | " " | "\n" [ \t]{0,20}
+string ::= "\"" char* "\"" space
+```
+
+</details>
From a27aa50ab7e07fe46aae619076b6e31d5663e914 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 28 Jun 2024 02:19:11 +0200 Subject: [PATCH 22/29] Add missing items in makefile (#8177) --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index bbfe0f12b..8ae4f1dc4 100644 --- a/Makefile +++ b/Makefile @@ -45,6 +45,7 @@ BUILD_TARGETS = \ TEST_TARGETS = \ tests/test-autorelease \ tests/test-backend-ops \ + tests/test-chat-template \ tests/test-double-float \ tests/test-grad0 \ tests/test-grammar-integration \ @@ -1070,6 +1071,7 @@ clean: rm -rvf src/*.o rm -rvf tests/*.o rm -rvf examples/*.o + rm -rvf common/*.o rm -rvf *.a rm -rvf *.dll rm -rvf *.so From e57dc62057d41211ac018056c19c02cd544694df Mon Sep 17 00:00:00 2001 From: pculliton Date: Fri, 28 Jun 2024 00:00:43 -0400 Subject: [PATCH 23/29] llama: Add support for Gemma2ForCausalLM (#8156) * Inference support for Gemma 2 model family * Update convert-hf-to-gguf.py, constants, and tensor mappings * cleanup * format fix * Fix special token vocab bug * Don't add space prefix * fix deleted lines * Update src/llama.cpp Co-authored-by: slaren * Add model type names * Add control vector * Fix model type identification --------- Co-authored-by: Andrei Betlen Co-authored-by: slaren --- convert-hf-to-gguf.py | 40 +++++++ gguf-py/gguf/constants.py | 23 ++++ gguf-py/gguf/tensor_mapping.py | 14 +++ src/llama.cpp | 198 ++++++++++++++++++++++++++++++++- 4 files changed, 274 insertions(+), 1 deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 5bf69ef9f..5bcc849db 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -2340,6 +2340,46 @@ class GemmaModel(Model): return [(self.map_tensor_name(name), data_torch)] +@Model.register("Gemma2ForCausalLM") +class Gemma2Model(Model): + model_arch = gguf.MODEL_ARCH.GEMMA2 + + def set_vocab(self): + self._set_vocab_llama_hf() + self.gguf_writer.add_add_space_prefix(False) + + def set_gguf_parameters(self): + hparams = self.hparams + block_count = hparams["num_hidden_layers"] + + self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) + self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_key_length(hparams["head_dim"]) + self.gguf_writer.add_value_length(hparams["head_dim"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unusem + + # lm_head is not used in llama.cpp, while autoawq will include this tensor in model + # To prevent errors, skip loading lm_head.weight. 
+ if name == "lm_head.weight": + logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") + return [] + + # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 + if name.endswith("norm.weight"): + data_torch = data_torch + 1 + + return [(self.map_tensor_name(name), data_torch)] + + @Model.register("Starcoder2ForCausalLM") class StarCoder2Model(Model): model_arch = gguf.MODEL_ARCH.STARCODER2 diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 222a2d137..cf3d09e70 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -150,6 +150,7 @@ class MODEL_ARCH(IntEnum): INTERNLM2 = auto() MINICPM = auto() GEMMA = auto() + GEMMA2 = auto() STARCODER2 = auto() MAMBA = auto() XVERSE = auto() @@ -180,10 +181,13 @@ class MODEL_TENSOR(IntEnum): ATTN_NORM = auto() ATTN_NORM_2 = auto() ATTN_OUT_NORM = auto() + ATTN_POST_NORM = auto() ATTN_ROT_EMBD = auto() FFN_GATE_INP = auto() FFN_GATE_INP_SHEXP = auto() FFN_NORM = auto() + FFN_PRE_NORM = auto() + FFN_POST_NORM = auto() FFN_GATE = auto() FFN_DOWN = auto() FFN_UP = auto() @@ -270,6 +274,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.INTERNLM2: "internlm2", MODEL_ARCH.MINICPM: "minicpm", MODEL_ARCH.GEMMA: "gemma", + MODEL_ARCH.GEMMA2: "gemma2", MODEL_ARCH.STARCODER2: "starcoder2", MODEL_ARCH.MAMBA: "mamba", MODEL_ARCH.XVERSE: "xverse", @@ -303,9 +308,12 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm", + MODEL_TENSOR.ATTN_POST_NORM: "blk.{bid}.post_attention_norm", MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp", MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp", MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", + MODEL_TENSOR.FFN_PRE_NORM: "blk.{bid}.ffn_norm", + MODEL_TENSOR.FFN_POST_NORM: "blk.{bid}.post_ffw_norm", MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", @@ -751,6 +759,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_NORM, ], + MODEL_ARCH.GEMMA2: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.FFN_PRE_NORM, + MODEL_TENSOR.FFN_POST_NORM, + ], MODEL_ARCH.STARCODER2: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 7b047f241..0bed43939 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -187,6 +187,10 @@ class TensorNameMap: "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx ), + MODEL_TENSOR.ATTN_POST_NORM: ( + "model.layers.{bid}.post_attention_layernorm", # gemma2 + ), + # Rotary embeddings MODEL_TENSOR.ATTN_ROT_EMBD: ( "model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf @@ -210,6 +214,16 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.rms_norm_2", # Grok ), + # Post feed-forward norm + MODEL_TENSOR.FFN_PRE_NORM: ( + "model.layers.{bid}.pre_feedforward_layernorm", # gemma2 + ), + + # Post feed-forward norm + MODEL_TENSOR.FFN_POST_NORM: ( + 
"model.layers.{bid}.post_feedforward_layernorm", # gemma2 + ), + MODEL_TENSOR.FFN_GATE_INP: ( "layers.{bid}.feed_forward.gate", # mixtral "model.layers.{bid}.block_sparse_moe.gate", # mixtral diff --git a/src/llama.cpp b/src/llama.cpp index 3dc0f8535..988ed4fdf 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -217,6 +217,7 @@ enum llm_arch { LLM_ARCH_INTERNLM2, LLM_ARCH_MINICPM, LLM_ARCH_GEMMA, + LLM_ARCH_GEMMA2, LLM_ARCH_STARCODER2, LLM_ARCH_MAMBA, LLM_ARCH_XVERSE, @@ -257,6 +258,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_INTERNLM2, "internlm2" }, { LLM_ARCH_MINICPM, "minicpm" }, { LLM_ARCH_GEMMA, "gemma" }, + { LLM_ARCH_GEMMA2, "gemma2" }, { LLM_ARCH_STARCODER2, "starcoder2" }, { LLM_ARCH_MAMBA, "mamba" }, { LLM_ARCH_XVERSE, "xverse" }, @@ -478,10 +480,12 @@ enum llm_tensor { LLM_TENSOR_ATTN_NORM, LLM_TENSOR_ATTN_NORM_2, LLM_TENSOR_ATTN_OUT_NORM, + LLM_TENSOR_ATTN_POST_NORM, LLM_TENSOR_ATTN_ROT_EMBD, LLM_TENSOR_FFN_GATE_INP, LLM_TENSOR_FFN_GATE_INP_SHEXP, LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_POST_NORM, LLM_TENSOR_FFN_GATE, LLM_TENSOR_FFN_DOWN, LLM_TENSOR_FFN_UP, @@ -1004,6 +1008,24 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_GEMMA2, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, + }, + }, { LLM_ARCH_STARCODER2, { @@ -2039,6 +2061,8 @@ enum e_model { MODEL_16x12B, MODEL_10B_128x3_66B, MODEL_57B_A14B, + MODEL_9B, + MODEL_27B, }; static const size_t kiB = 1024; @@ -2215,6 +2239,7 @@ struct llama_layer { struct ggml_tensor * attn_q_a_norm; struct ggml_tensor * attn_kv_a_norm; struct ggml_tensor * attn_sub_norm; + struct ggml_tensor * attn_post_norm; struct ggml_tensor * ffn_sub_norm; // attention @@ -2238,6 +2263,7 @@ struct llama_layer { // normalization struct ggml_tensor * ffn_norm; struct ggml_tensor * ffn_norm_b; + struct ggml_tensor * ffn_post_norm; struct ggml_tensor * layer_out_norm; struct ggml_tensor * layer_out_norm_b; struct ggml_tensor * ffn_norm_exps; @@ -4269,6 +4295,8 @@ static const char * llama_model_type_name(e_model type) { case MODEL_16x12B: return "16x12B"; case MODEL_10B_128x3_66B: return "10B+128x3.66B"; case MODEL_57B_A14B: return "57B.A14B"; + case MODEL_9B: return "9B"; + case MODEL_27B: return "27B"; default: return "?B"; } } @@ -4671,6 +4699,16 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_GEMMA2: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + switch (hparams.n_layer) { + case 42: model.type = e_model::MODEL_9B; break; + case 46: model.type = e_model::MODEL_27B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; case LLM_ARCH_STARCODER2: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -6512,6 +6550,40 @@ static bool llm_load_tensors( layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); } } break; + case LLM_ARCH_GEMMA2: + { + model.tok_embd = 
ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading + + const int64_t n_ff = hparams.n_ff; + const int64_t n_embd_head_k = hparams.n_embd_head_k; + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + + for (uint32_t i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd}); + layer.attn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}); + } + } break; case LLM_ARCH_STARCODER2: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); @@ -10923,6 +10995,125 @@ struct llm_build_context { return gf; } + struct ggml_cgraph * build_gemma2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t n_embd_head_k = hparams.n_embd_head_k; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); + cb(inpL, "inp_scaled", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr, + n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Qcur, 
"Qcur", il); + + Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); + cb(Qcur, "Qcur_scaled", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr, + n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); + } + + cur = llm_build_norm(ctx0, cur, hparams, + model.layers[il].attn_post_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_post_norm", il); + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); + cb(sa_out, "sa_out", il); + + cur = llm_build_norm(ctx0, sa_out, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + // feed-forward network + { + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + + cur = llm_build_norm(ctx0, cur, hparams, + model.layers[il].ffn_post_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "ffn_post_norm", -1); + + cur = ggml_add(ctx0, cur, sa_out); + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_starcoder2() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); @@ -12303,6 +12494,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_gemma(); } break; + case LLM_ARCH_GEMMA2: + { + result = llm.build_gemma2(); + } break; case LLM_ARCH_STARCODER2: { result = llm.build_starcoder2(); @@ -17597,6 +17792,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_PHI2: case LLM_ARCH_PHI3: case LLM_ARCH_GEMMA: + case LLM_ARCH_GEMMA2: case LLM_ARCH_STARCODER2: case LLM_ARCH_GPTNEOX: return LLAMA_ROPE_TYPE_NEOX; @@ -19486,7 +19682,7 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "assistant\n"; } - } else if (tmpl == "gemma" || tmpl.find("") != std::string::npos) { + } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl.find("") != std::string::npos) { // google/gemma-7b-it std::string system_prompt = ""; for (auto message : chat) { From 139cc621e90b4f61830515c3c124cf35b3d7a6dc Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Fri, 28 Jun 2024 09:26:45 +0100 Subject: [PATCH 24/29] `json`: restore default additionalProperties to false, fix some pattern escapes (#8180) * json: expand ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS charset * json: revert default of additionalProperties to false * Update README.md --- common/json-schema-to-grammar.cpp | 4 +- examples/json_schema_to_grammar.py | 6 +-- .../server/public/json-schema-to-grammar.mjs | 4 +- grammars/README.md | 37 ++++++++++++------ 
tests/test-grammar-integration.cpp | 39 ++++++++++++++++++- tests/test-json-schema-to-grammar.cpp | 31 ++------------- 6 files changed, 73 insertions(+), 48 deletions(-) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 2f233e2e7..881eb49e3 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -316,7 +316,7 @@ std::unordered_map GRAMMAR_LITERAL_ESCAPES = { }; std::unordered_set NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'}; -std::unordered_set ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'[', ']', '(', ')', '|', '{', '}', '*', '+', '?'}; +std::unordered_set ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'}; template std::string join(Iterator begin, Iterator end, const std::string & separator) { @@ -720,7 +720,7 @@ private: } prop_names.push_back(prop_name); } - if (!(additional_properties.is_boolean() && !additional_properties.get())) { + if ((additional_properties.is_boolean() && additional_properties.get()) || additional_properties.is_object()) { std::string sub_name = name + (name.empty() ? "" : "-") + "additional"; std::string value_rule = additional_properties.is_object() ? visit(additional_properties, sub_name + "-value") diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index 92f6e3d47..072a230f7 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -231,7 +231,7 @@ GRAMMAR_RANGE_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\]\-\\]') GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]'} NON_LITERAL_SET = set('|.()[]{}*+?') -ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?') +ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('^$.[]()|{}*+?') class SchemaConverter: @@ -602,7 +602,7 @@ class SchemaConverter: else: add_component(t, is_required=True) - return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=[])) + return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None)) elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema): items = schema.get('items') or schema['prefixItems'] @@ -691,7 +691,7 @@ class SchemaConverter: required_props = [k for k in sorted_props if k in required] optional_props = [k for k in sorted_props if k not in required] - if additional_properties != False: + if additional_properties is not None and additional_properties != False: sub_name = f'{name}{"-" if name else ""}additional' value_rule = self.visit(additional_properties, f'{sub_name}-value') if isinstance(additional_properties, dict) else \ self._add_primitive('value', PRIMITIVE_RULES['value']) diff --git a/examples/server/public/json-schema-to-grammar.mjs b/examples/server/public/json-schema-to-grammar.mjs index 06d76edde..7267f3f9c 100644 --- a/examples/server/public/json-schema-to-grammar.mjs +++ b/examples/server/public/json-schema-to-grammar.mjs @@ -259,7 +259,7 @@ const GRAMMAR_RANGE_LITERAL_ESCAPE_RE = /[\n\r"\]\-\\]/g; const GRAMMAR_LITERAL_ESCAPES = { '\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]' }; const NON_LITERAL_SET = new Set('|.()[]{}*+?'); -const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('[]()|{}*+?'); +const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('^$.[]()|{}*+?'); export class SchemaConverter { constructor(options) { @@ -751,7 +751,7 @@ export class 
SchemaConverter { const requiredProps = sortedProps.filter(k => required.has(k)); const optionalProps = sortedProps.filter(k => !required.has(k)); - if (additionalProperties !== false) { + if (additionalProperties) { const subName = `${name ?? ''}${name ? '-' : ''}additional`; const valueRule = additionalProperties != null && typeof additionalProperties === 'object' ? this.visit(additionalProperties, `${subName}-value`) diff --git a/grammars/README.md b/grammars/README.md index 40f666240..886023f77 100644 --- a/grammars/README.md +++ b/grammars/README.md @@ -182,6 +182,8 @@ space ::= | " " | "\n" [ \t]{0,20} Here is also a list of known limitations (contributions welcome): +- `additionalProperties` defaults to `false` (produces faster grammars + reduces hallucinations). +- `"additionalProperties": true` may produce keys that contain unescaped newlines. - Unsupported features are skipped silently. It is currently advised to use the command-line Python converter (see above) to see any warnings, and to inspect the resulting grammar / test it w/ [llama-gbnf-validator](../examples/gbnf-validator/gbnf-validator.cpp). - Can't mix `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703) - [prefixItems](https://json-schema.org/draft/2020-12/json-schema-core#name-prefixitems) is broken (but [items](https://json-schema.org/draft/2020-12/json-schema-core#name-items) works) @@ -203,10 +205,11 @@ And a non-exhaustive list of other unsupported features that are unlikely to be ### A word about additionalProperties > [!WARNING] -> By default, `object`s accept [additional properties](https://json-schema.org/understanding-json-schema/reference/object#additionalproperties), which you might not want / not expect, and which will make sampling slower (not just because of the extra tokens, but also generates a slower grammar). -> You can set `"additionalProperties": false` on the schema of any object to ensure only properties listed in `properties` are generated (not needed for non-`object` types, e.g. `array` or `string`). +> The JSON schemas spec states `object`s accept [additional properties](https://json-schema.org/understanding-json-schema/reference/object#additionalproperties) by default. +> Since this is slow and seems prone to hallucinations, we default to no additional properties. +> You can set `"additionalProperties": true` in the the schema of any object to explicitly allow additional properties. 
-If you're using [Pydantic](https://pydantic.dev/) to generate schemas, you can disable additional properties with the `extra` config on each model class: +If you're using [Pydantic](https://pydantic.dev/) to generate schemas, you can enable additional properties with the `extra` config on each model class: ```python # pip install pydantic @@ -215,14 +218,14 @@ from typing import Annotated, List from pydantic import BaseModel, Extra, Field class QAPair(BaseModel): class Config: - extra = 'forbid' # triggers additionalProperties: false in the JSON schema + extra = 'allow' # triggers additionalProperties: true in the JSON schema question: str concise_answer: str justification: str class Summary(BaseModel): class Config: - extra = 'forbid' + extra = 'allow' key_facts: List[Annotated[str, Field(pattern='- .{5,}')]] question_answers: List[Annotated[List[QAPair], Field(min_items=5)]] @@ -236,7 +239,7 @@ print(json.dumps(Summary.model_json_schema(), indent=2)) { "$defs": { "QAPair": { - "additionalProperties": false, + "additionalProperties": true, "properties": { "question": { "title": "Question", @@ -260,7 +263,7 @@ print(json.dumps(Summary.model_json_schema(), indent=2)) "type": "object" } }, - "additionalProperties": false, + "additionalProperties": true, "properties": { "key_facts": { "items": { @@ -292,30 +295,40 @@ print(json.dumps(Summary.model_json_schema(), indent=2)) ``` ``` -QAPair ::= "{" space QAPair-question-kv "," space QAPair-concise-answer-kv "," space QAPair-justification-kv "}" space +QAPair ::= "{" space QAPair-question-kv "," space QAPair-concise-answer-kv "," space QAPair-justification-kv ( "," space ( QAPair-additional-kv ( "," space QAPair-additional-kv )* ) )? "}" space +QAPair-additional-k ::= ["] ( [c] ([o] ([n] ([c] ([i] ([s] ([e] ([_] ([a] ([n] ([s] ([w] ([e] ([r] char+ | [^"r] char*) | [^"e] char*) | [^"w] char*) | [^"s] char*) | [^"n] char*) | [^"a] char*) | [^"_] char*) | [^"e] char*) | [^"s] char*) | [^"i] char*) | [^"c] char*) | [^"n] char*) | [^"o] char*) | [j] ([u] ([s] ([t] ([i] ([f] ([i] ([c] ([a] ([t] ([i] ([o] ([n] char+ | [^"n] char*) | [^"o] char*) | [^"i] char*) | [^"t] char*) | [^"a] char*) | [^"c] char*) | [^"i] char*) | [^"f] char*) | [^"i] char*) | [^"t] char*) | [^"s] char*) | [^"u] char*) | [q] ([u] ([e] ([s] ([t] ([i] ([o] ([n] char+ | [^"n] char*) | [^"o] char*) | [^"i] char*) | [^"t] char*) | [^"s] char*) | [^"e] char*) | [^"u] char*) | [^"cjq] char* )? ["] space +QAPair-additional-kv ::= QAPair-additional-k ":" space value QAPair-concise-answer-kv ::= "\"concise_answer\"" space ":" space string QAPair-justification-kv ::= "\"justification\"" space ":" space string QAPair-question-kv ::= "\"question\"" space ":" space string +additional-k ::= ["] ( [k] ([e] ([y] ([_] ([f] ([a] ([c] ([t] ([s] char+ | [^"s] char*) | [^"t] char*) | [^"c] char*) | [^"a] char*) | [^"f] char*) | [^"_] char*) | [^"y] char*) | [^"e] char*) | [q] ([u] ([e] ([s] ([t] ([i] ([o] ([n] ([_] ([a] ([n] ([s] ([w] ([e] ([r] ([s] char+ | [^"s] char*) | [^"r] char*) | [^"e] char*) | [^"w] char*) | [^"s] char*) | [^"n] char*) | [^"a] char*) | [^"_] char*) | [^"n] char*) | [^"o] char*) | [^"i] char*) | [^"t] char*) | [^"s] char*) | [^"e] char*) | [^"u] char*) | [^"kq] char* )? ["] space +additional-kv ::= additional-k ":" space value +array ::= "[" space ( value ("," space value)* )? 
"]" space +boolean ::= ("true" | "false") space char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) +decimal-part ::= [0-9]{1,16} dot ::= [^\x0A\x0D] +integral-part ::= [0] | [1-9] [0-9]{0,15} key-facts ::= "[" space (key-facts-item ("," space key-facts-item)*)? "]" space key-facts-item ::= "\"" "- " key-facts-item-1{5,} "\"" space key-facts-item-1 ::= dot key-facts-kv ::= "\"key_facts\"" space ":" space key-facts +null ::= "null" space +number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space +object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space question-answers ::= "[" space (question-answers-item ("," space question-answers-item)*)? "]" space question-answers-item ::= "[" space question-answers-item-item ("," space question-answers-item-item){4,} "]" space question-answers-item-item ::= QAPair question-answers-kv ::= "\"question_answers\"" space ":" space question-answers -root ::= "{" space key-facts-kv "," space question-answers-kv "}" space +root ::= "{" space key-facts-kv "," space question-answers-kv ( "," space ( additional-kv ( "," space additional-kv )* ) )? "}" space space ::= | " " | "\n" [ \t]{0,20} string ::= "\"" char* "\"" space +value ::= object | array | string | number | boolean | null ``` -If you're using [Zod](https://zod.dev/), you can make your objects explicitly strict w/ `z.object(...).strict()` or `z.strictObject(...)`. - -Note however that [zod-to-json-schema](https://github.com/StefanTerdell/zod-to-json-schema) currently always seems to set `"additionalProperties": false` anyway (even w/ zod schemas on which `nonstrict()` / `passthrough()` was called). +If you're using [Zod](https://zod.dev/), you can make your objects to explicitly allow extra properties w/ `nonstrict()` / `passthrough()` (or explicitly no extra props w/ `z.object(...).strict()` or `z.strictObject(...)`) but note that [zod-to-json-schema](https://github.com/StefanTerdell/zod-to-json-schema) currently always sets `"additionalProperties": false` anyway. 
```js import { z } from 'zod'; diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp index 0e21dc795..975658f79 100644 --- a/tests/test-grammar-integration.cpp +++ b/tests/test-grammar-integration.cpp @@ -993,6 +993,40 @@ static void test_json_schema() { } ); + test_schema( + "simple pattern", + // Schema + R"""({ + "pattern": "^[a-zA-Z0-9_-]*$" + })""", + // Passing strings + { + R"""("")""", + R"""("He_llo-12")""", + }, + // Failing strings + { + R"""("!")""", + R"""("Hello World")""", + } + ); + + test_schema( + "pattern with escapes", + // Schema + R"""({ + "pattern": "^a\\^\\$\\.\\[\\]\\(\\)\\|\\{\\}\\*\\+\\?b$" + })""", + // Passing strings + { + R"""("a^$.[]()|{}*+?b")""", + }, + // Failing strings + { + R"""("ab")""", + } + ); + test_schema( "", // Schema @@ -1062,8 +1096,6 @@ static void test_json_schema() { R"""({ "number": 1600, "street_name": "Pennsylvania" })""", // "By extension, even an empty object is valid" R"""({})""", - // "By default, providing additional properties is valid" - R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""", R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""", }, // Failing strings @@ -1074,6 +1106,9 @@ static void test_json_schema() { R"""({ "street_name": "Pennsylvania", "number": 1600 })""", // Reorder properties R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""", + // "Additional properties default to false for generation, even though the spec says true. + R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""", + } ); diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index 3aaa11833..720a949c7 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -1120,28 +1120,15 @@ static void test_all(const std::string & lang, std::function Date: Fri, 28 Jun 2024 12:37:45 +0200 Subject: [PATCH 25/29] cmake : allow user to override default options (#8178) --- CMakeLists.txt | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index dba083089..e3a0cc369 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -79,8 +79,15 @@ set(GGML_SANITIZE_ADDRESS ${LLAMA_SANITIZE_ADDRESS}) set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED}) set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS}) set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS}) -set(GGML_LLAMAFILE ON) -set(GGML_CUDA_USE_GRAPHS ON) + +# change the default for these ggml options +if (NOT DEFINED GGML_LLAMAFILE) + set(GGML_LLAMAFILE ON) +endif() + +if (NOT DEFINED GGML_CUDA_USE_GRAPHS) + set(GGML_CUDA_USE_GRAPHS ON) +endif() # transition helpers function (llama_option_depr TYPE OLD NEW) From 38373cfbab5397cc2ab5c3694a3dee12a9e58f45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Fri, 28 Jun 2024 12:53:43 +0200 Subject: [PATCH 26/29] Add SPM infill support (#8016) * add --spm-infill option * support --spm-infill * support --spm-infill --- common/common.cpp | 6 ++++++ common/common.h | 2 ++ examples/infill/README.md | 1 + examples/infill/infill.cpp | 24 +++++++++++++----------- examples/server/README.md | 1 + examples/server/server.cpp | 16 +++++++++++----- 6 files changed, 34 insertions(+), 16 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 57d03a578..6a00d25be 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1026,6 +1026,10 @@ bool 
gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.input_suffix = argv[i]; return true; } + if (arg == "--spm-infill") { + params.spm_infill = true; + return true; + } if (arg == "--grammar") { CHECK_ARG sparams.grammar = argv[i]; @@ -1409,6 +1413,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" }); options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" }); options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" }); + options.push_back({ "server infill", + " --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" }); options.push_back({ "sampling" }); options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n" diff --git a/common/common.h b/common/common.h index 0486ba380..d6cb814b9 100644 --- a/common/common.h +++ b/common/common.h @@ -250,6 +250,8 @@ struct gpt_params { std::string cvector_outfile = "control_vector.gguf"; std::string cvector_positive_file = "examples/cvector-generator/positive.txt"; std::string cvector_negative_file = "examples/cvector-generator/negative.txt"; + + bool spm_infill = false; // suffix/prefix/middle pattern for infill }; void gpt_params_handle_model_default(gpt_params & params); diff --git a/examples/infill/README.md b/examples/infill/README.md index 74f42d2fc..810a0c5e7 100644 --- a/examples/infill/README.md +++ b/examples/infill/README.md @@ -15,6 +15,7 @@ In this section, we cover the most commonly used options for running the `infill - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses. - `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text. - `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. +- `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. ## Input Prompts diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 3e82e4a81..ca71dd687 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -210,6 +210,7 @@ int main(int argc, char ** argv) { suff_rm_leading_spc = false; } std::vector embd_inp; + std::vector embd_end; std::vector inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false); std::vector inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false); const int space_token = 29871; @@ -217,12 +218,13 @@ int main(int argc, char ** argv) { inp_sfx.erase(inp_sfx.begin()); } inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); - if (add_bos) { - inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model)); - } inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); - embd_inp = inp_pfx; - embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); + embd_inp = params.spm_infill ? inp_sfx : inp_pfx; + embd_end = params.spm_infill ? 
inp_pfx : inp_sfx; + if (add_bos) { + embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); + } + embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); const llama_token middle_token = llama_token_middle(model); if (middle_token >= 0) { @@ -526,14 +528,14 @@ int main(int argc, char ** argv) { inp_sfx.erase(inp_sfx.begin()); } inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); - if (add_bos) { - inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model)); - } inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); - embd_inp = inp_pfx; - embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); + embd_inp = params.spm_infill ? inp_sfx : inp_pfx; + embd_end = params.spm_infill ? inp_pfx : inp_sfx; + if (add_bos) { + embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); + } + embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); - const llama_token middle_token = llama_token_middle(model); if (middle_token >= 0) { embd_inp.push_back(middle_token); } diff --git a/examples/server/README.md b/examples/server/README.md index e7fb0bf64..4fab006bb 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -73,6 +73,7 @@ The project is under active development, and we are [looking for feedback and co - `-fa`, `--flash-attn` : enable flash attention (default: disabled). - `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`) - `-ctv TYPE`, `--cache-type-v TYPE` : KV cache type for V (default `f16`, see `-ctk` for options) +- `--spm-infill` : Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. **If compiled with `LLAMA_SERVER_SSL=ON`** - `--ssl-key-file FNAME`: path to file a PEM-encoded SSL private key diff --git a/examples/server/server.cpp b/examples/server/server.cpp index ae768097b..d7fb61812 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2020,6 +2020,7 @@ struct server_context { slot.t_start_generation = 0; if (slot.infill) { + const bool add_bos = llama_should_add_bos_token(model); bool suff_rm_leading_spc = true; if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) { params.input_suffix.erase(0, 1); @@ -2035,16 +2036,21 @@ struct server_context { } prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model)); - prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS - prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model)); - prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end()); + suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model)); + + auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens; + auto embd_end = params.spm_infill ? 
prefix_tokens : suffix_tokens; + if (add_bos) { + embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); + } + embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); const llama_token middle_token = llama_token_middle(model); if (middle_token >= 0) { - prefix_tokens.push_back(middle_token); + embd_inp.push_back(middle_token); } - prompt_tokens = prefix_tokens; + prompt_tokens = embd_inp; } else { prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt } From 26a39bbd6b0bbd66118bb68569f0276d7fe7df6c Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 28 Jun 2024 15:11:44 +0200 Subject: [PATCH 27/29] Add MiniCPM, Deepseek V2 chat template + clean up `llama_chat_apply_template_internal` (#8172) * tmp_contains * minicpm chat template * add DeepSeek Lite template * change deepseek-lite to deepseek2 * correct code comment * correct code from master branch --- src/llama.cpp | 64 ++++++++++++++++++++++++++---------- tests/test-chat-template.cpp | 10 +++++- 2 files changed, 56 insertions(+), 18 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 988ed4fdf..3edaa98e8 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -19613,7 +19613,10 @@ static int32_t llama_chat_apply_template_internal( std::string & dest, bool add_ass) { // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527 std::stringstream ss; - if (tmpl == "chatml" || tmpl.find("<|im_start|>") != std::string::npos) { + auto tmpl_contains = [&tmpl](std::string haystack) -> bool { + return tmpl.find(haystack) != std::string::npos; + }; + if (tmpl == "chatml" || tmpl_contains("<|im_start|>")) { // chatml template for (auto message : chat) { ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n"; @@ -19621,16 +19624,16 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|im_start|>assistant\n"; } - } else if (tmpl == "llama2" || tmpl == "mistral" || tmpl.find("[INST]") != std::string::npos) { + } else if (tmpl == "llama2" || tmpl == "mistral" || tmpl_contains("[INST]")) { // llama2 template and its variants // [variant] support system message - bool support_system_message = tmpl.find("<>") != std::string::npos || tmpl == "mistral"; + bool support_system_message = tmpl_contains("<>") || tmpl == "mistral"; // [variant] space before + after response - bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos; + bool space_around_response = tmpl_contains("' ' + eos_token"); // [variant] add BOS inside history - bool add_bos_inside_history = tmpl.find("bos_token + '[INST]") != std::string::npos; + bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]"); // [variant] trim spaces from the input message - bool strip_message = tmpl.find("content.strip()") != std::string::npos; + bool strip_message = tmpl_contains("content.strip()"); // construct the prompt bool is_inside_turn = true; // skip BOS at the beginning ss << "[INST] "; @@ -19656,7 +19659,7 @@ static int32_t llama_chat_apply_template_internal( } } // llama2 templates seem to not care about "add_generation_prompt" - } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos)) { + } else if (tmpl == "phi3" || (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>"))) { // Phi 3 for (auto message : chat) { std::string role(message->role); @@ -19665,7 +19668,7 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << 
"<|assistant|>\n"; } - } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) { + } else if (tmpl == "zephyr" || tmpl_contains("<|user|>")) { // zephyr template for (auto message : chat) { ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n"; @@ -19673,7 +19676,7 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|assistant|>\n"; } - } else if (tmpl == "monarch" || tmpl.find("bos_token + message['role']") != std::string::npos) { + } else if (tmpl == "monarch" || tmpl_contains("bos_token + message['role']")) { // mlabonne/AlphaMonarch-7B template (the is included inside history) for (auto message : chat) { std::string bos = (message == chat.front()) ? "" : ""; // skip BOS for first message @@ -19682,7 +19685,7 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "assistant\n"; } - } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl.find("") != std::string::npos) { + } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl_contains("")) { // google/gemma-7b-it std::string system_prompt = ""; for (auto message : chat) { @@ -19704,7 +19707,7 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "model\n"; } - } else if (tmpl == "orion" || tmpl.find("'\\n\\nAssistant: ' + eos_token") != std::string::npos) { + } else if (tmpl == "orion" || tmpl_contains("'\\n\\nAssistant: ' + eos_token")) { // OrionStarAI/Orion-14B-Chat std::string system_prompt = ""; for (auto message : chat) { @@ -19724,7 +19727,7 @@ static int32_t llama_chat_apply_template_internal( ss << message->content << ""; } } - } else if (tmpl == "openchat" || tmpl.find("GPT4 Correct ") != std::string::npos) { + } else if (tmpl == "openchat" || tmpl_contains("GPT4 Correct ")) { // openchat/openchat-3.5-0106, for (auto message : chat) { std::string role(message->role); @@ -19738,13 +19741,13 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "GPT4 Correct Assistant:"; } - } else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl.find("USER: ") != std::string::npos && tmpl.find("ASSISTANT: ") != std::string::npos)) { + } else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: "))) { // eachadea/vicuna-13b-1.1 (and Orca variant) for (auto message : chat) { std::string role(message->role); if (role == "system") { // Orca-Vicuna variant uses a system prefix - if (tmpl == "vicuna-orca" || tmpl.find("SYSTEM: ") != std::string::npos) { + if (tmpl == "vicuna-orca" || tmpl_contains("SYSTEM: ")) { ss << "SYSTEM: " << message->content << "\n"; } else { ss << message->content << "\n\n"; @@ -19758,7 +19761,7 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "ASSISTANT:"; } - } else if (tmpl == "deepseek" || (tmpl.find("### Instruction:") != std::string::npos && tmpl.find("<|EOT|>") != std::string::npos)) { + } else if (tmpl == "deepseek" || (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>"))) { // deepseek-ai/deepseek-coder-33b-instruct for (auto message : chat) { std::string role(message->role); @@ -19773,7 +19776,7 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "### Response:\n"; } - } else if (tmpl == "command-r" || (tmpl.find("<|START_OF_TURN_TOKEN|>") != std::string::npos && tmpl.find("<|USER_TOKEN|>") != std::string::npos)) { + } else if (tmpl == "command-r" || (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>"))) { // CohereForAI/c4ai-command-r-plus for 
(auto message : chat) { std::string role(message->role); @@ -19788,7 +19791,7 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"; } - } else if (tmpl == "llama3" || (tmpl.find("<|start_header_id|>") != std::string::npos && tmpl.find("<|end_header_id|>") != std::string::npos)) { + } else if (tmpl == "llama3" || (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>"))) { // Llama 3 for (auto message : chat) { std::string role(message->role); @@ -19797,6 +19800,33 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|start_header_id|>assistant<|end_header_id|>\n\n"; } + } else if (tmpl == "minicpm" || tmpl_contains(u8"<用户>")) { + // MiniCPM-3B-OpenHermes-2.5-v2-GGUF + for (auto message : chat) { + std::string role(message->role); + if (role == "user") { + ss << u8"<用户>"; + ss << trim(message->content); + ss << ""; + } else { + ss << trim(message->content); + } + } + } else if (tmpl == "deepseek2" || tmpl_contains("'Assistant: ' + message['content'] + eos_token")) { + // DeepSeek-V2 + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + ss << message->content << "\n\n"; + } else if (role == "user") { + ss << "User: " << message->content << "\n\n"; + } else if (role == "assistant") { + ss << "Assistant: " << message->content << u8"<|end▁of▁sentence|>"; + } + } + if (add_ass) { + ss << "Assistant:"; + } } else { // template not supported return -1; diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp index d19ba8633..b154038b2 100644 --- a/tests/test-chat-template.cpp +++ b/tests/test-chat-template.cpp @@ -57,7 +57,11 @@ int main(void) { //Phi-3-medium "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}", //Phi-3-vision - "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}" + "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}", + // MiniCPM-3B-OpenHermes-2.5-v2-GGUF + u8"{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}", + // DeepSeek-V2 + "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}", }; std::vector expected_output = { // teknium/OpenHermes-2.5-Mistral-7B @@ -94,6 +98,10 @@ int main(void) { "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n", //Phi-3-vision "<|system|>\nYou are 
a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n", + // MiniCPM-3B-OpenHermes-2.5-v2-GGUF + u8"You are a helpful assistant<用户>HelloHi there<用户>Who are youI am an assistant<用户>Another question", + // DeepSeek-V2 + u8"You are a helpful assistant\n\nUser: Hello\n\nAssistant: Hi there<|end▁of▁sentence|>User: Who are you\n\nAssistant: I am an assistant <|end▁of▁sentence|>User: Another question\n\nAssistant:", }; std::vector formatted_chat(1024); int32_t res; From 8748d8ac6f172b99826ab18f01d9a3a165987d54 Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Fri, 28 Jun 2024 18:02:05 +0100 Subject: [PATCH 28/29] json: attempt to skip slow tests when running under emulator (#8189) --- .github/workflows/build.yml | 1 + tests/test-json-schema-to-grammar.cpp | 40 +++++++++++++++------------ 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index adf67cecc..1e344db6b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -799,6 +799,7 @@ jobs: 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe) cd build + $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1 & $sde -future -- ctest -L main -C Release --verbose --timeout 900 - name: Determine tag name diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index 720a949c7..65486ac5c 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -1239,26 +1239,30 @@ int main() { } }); - if (getenv("LLAMA_PYTHON_AVAILABLE") || (std::system("python -c \"import sys; exit(1) if sys.version_info < (3, 8) else print('Python version is sufficient')\"") == 0)) { - test_all("Python", [](const TestCase & tc) { - write("test-json-schema-input.tmp", tc.schema); - tc.verify_status(std::system( - "python ./examples/json_schema_to_grammar.py test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE); - tc.verify(read("test-grammar-output.tmp")); - }); + if (getenv("LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR")) { + fprintf(stderr, "\033[33mWARNING: Skipping slow tests on emulator.\n\033[0m"); } else { - fprintf(stderr, "\033[33mWARNING: Python not found (min version required is 3.8), skipping Python JSON schema -> grammar tests.\n\033[0m"); - } + if (getenv("LLAMA_PYTHON_AVAILABLE") || (std::system("python -c \"import sys; exit(1) if sys.version_info < (3, 8) else print('Python version is sufficient')\"") == 0)) { + test_all("Python", [](const TestCase & tc) { + write("test-json-schema-input.tmp", tc.schema); + tc.verify_status(std::system( + "python ./examples/json_schema_to_grammar.py test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE); + tc.verify(read("test-grammar-output.tmp")); + }); + } else { + fprintf(stderr, "\033[33mWARNING: Python not found (min version required is 3.8), skipping Python JSON schema -> grammar tests.\n\033[0m"); + } - if (getenv("LLAMA_NODE_AVAILABLE") || (std::system("node --version") == 0)) { - test_all("JavaScript", [](const TestCase & tc) { - write("test-json-schema-input.tmp", tc.schema); - tc.verify_status(std::system( - "node ./tests/run-json-schema-to-grammar.mjs test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? 
SUCCESS : FAILURE); - tc.verify(read("test-grammar-output.tmp")); - }); - } else { - fprintf(stderr, "\033[33mWARNING: Node not found, skipping JavaScript JSON schema -> grammar tests.\n\033[0m"); + if (getenv("LLAMA_NODE_AVAILABLE") || (std::system("node --version") == 0)) { + test_all("JavaScript", [](const TestCase & tc) { + write("test-json-schema-input.tmp", tc.schema); + tc.verify_status(std::system( + "node ./tests/run-json-schema-to-grammar.mjs test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE); + tc.verify(read("test-grammar-output.tmp")); + }); + } else { + fprintf(stderr, "\033[33mWARNING: Node not found, skipping JavaScript JSON schema -> grammar tests.\n\033[0m"); + } } test_all("Check Expectations Validity", [](const TestCase & tc) { From 72272b83a3878e91251218c981b4c6ec16c33912 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 29 Jun 2024 00:14:20 +0200 Subject: [PATCH 29/29] fix code typo in llama-cli (#8198) --- examples/main/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index cfaf6a6e8..1114073b8 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -810,7 +810,7 @@ int main(int argc, char ** argv) { is_antiprompt = true; } - chat_add_and_format(model, chat_msgs, "system", assistant_ss.str()); + chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str()); is_interacting = true; printf("\n"); }
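Patch 26 above (`--spm-infill`, #8016) does not change tokenization itself; it only swaps the order in which the prefix and suffix blocks are concatenated before the middle (FIM) token is appended, and on the server side it switches from always adding BOS to honouring `llama_should_add_bos_token`. A minimal sketch of that assembly step, extracted from the `examples/infill/infill.cpp` hunk into a standalone helper; the helper name and parameters are illustrative, and a loaded `llama_model` plus already-tokenized prefix/suffix are assumed:

```cpp
// Sketch only: mirrors the prompt-assembly logic added to examples/infill/infill.cpp.
// Assumes llama.h from this revision (llama_token_prefix/suffix/middle/bos).
#include "llama.h"

#include <vector>

static std::vector<llama_token> build_infill_prompt(
        const llama_model *      model,
        std::vector<llama_token> inp_pfx,     // tokenized prefix, no special tokens
        std::vector<llama_token> inp_sfx,     // tokenized suffix, no special tokens
        bool                     spm_infill,  // true = Suffix/Prefix/Middle order
        bool                     add_bos) {
    inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
    inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));

    // PSM (default): prefix block first; SPM (--spm-infill): suffix block first.
    std::vector<llama_token> embd_inp = spm_infill ? inp_sfx : inp_pfx;
    std::vector<llama_token> embd_end = spm_infill ? inp_pfx : inp_sfx;

    if (add_bos) {
        embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
    }
    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());

    // The middle token always goes last, if the model defines one.
    const llama_token middle_token = llama_token_middle(model);
    if (middle_token >= 0) {
        embd_inp.push_back(middle_token);
    }
    return embd_inp;
}
```

Both the infill example and the server's infill handling route through this ordering once `--spm-infill` is set.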
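Patch 27 (#8172) makes the MiniCPM and DeepSeek-V2 templates reachable through the public `llama_chat_apply_template` API, selected either by the template string stored in the model's GGUF metadata or by the explicit names `"minicpm"` / `"deepseek2"`. A minimal usage sketch, assuming a loaded `model`; the conversation, buffer size, and helper name are illustrative, and the resize-and-retry handles the case where the returned length exceeds the buffer:

```cpp
// Sketch only: format a short conversation with one of the templates added in this patch.
#include "llama.h"

#include <cstdio>
#include <string>
#include <vector>

static std::string format_with_template(const llama_model * model) {
    std::vector<llama_chat_message> chat = {
        { "system",    "You are a helpful assistant" },
        { "user",      "Hello"                       },
        { "assistant", "Hi there"                    },
        { "user",      "Who are you"                 },
    };

    std::vector<char> buf(2048);
    // "deepseek2" / "minicpm" are the identifiers introduced above; passing nullptr
    // would instead select the template embedded in the model's metadata.
    int32_t res = llama_chat_apply_template(model, "deepseek2",
            chat.data(), chat.size(), /*add_ass=*/true, buf.data(), (int32_t) buf.size());
    if (res < 0) {
        fprintf(stderr, "template not supported\n");  // the -1 path in the diff above
        return "";
    }
    if ((size_t) res > buf.size()) {                  // returned value is the required size
        buf.resize(res);
        res = llama_chat_apply_template(model, "deepseek2",
                chat.data(), chat.size(), /*add_ass=*/true, buf.data(), (int32_t) buf.size());
    }
    return std::string(buf.data(), (size_t) res);
}
```

The new cases in `tests/test-chat-template.cpp` above pin down the exact strings these two templates are expected to produce.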