mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-10 17:14:36 +00:00
* koboldcpp-ROCm Port commit 3416c986d9d9a31c3cdefd7e7bd4d9438d72ba35 Merge: 5eb17f04c4e435
Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Fri Aug 25 13:46:56 2023 -0500 Merge remote-tracking branch 'upstream/concedo' commit 5eb17f02c8638e003bb91bddf95ccf54d2ad0c12 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Fri Aug 25 13:38:21 2023 -0500 ROCm Port update * use hipblas based on cublas * Update Makefile for the Cuda kernels * Expand arch list and make it overrideable * Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5) * add hipBLAS to README * new build arg LLAMA_CUDA_MMQ_Y * fix half2 decomposition * Add intrinsics polyfills for AMD * AMD assembly optimized __dp4a * Allow overriding CC_TURING * use "ROCm" instead of "CUDA" * ignore all build dirs * Add Dockerfiles * fix llama-bench * fix -nommq help for non CUDA/HIP --------- Co-Authored-By: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Co-Authored-By: ardfork <134447697+ardfork@users.noreply.github.com> Co-Authored-By: funnbot <22226942+funnbot@users.noreply.github.com> Co-Authored-By: Engininja2 <139037756+Engininja2@users.noreply.github.com> Co-Authored-By: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> Co-Authored-By: jammm <2500920+jammm@users.noreply.github.com> Co-Authored-By: jdecourval <7315817+jdecourval@users.noreply.github.com> commit b34f4bd2724733e188ec4f6074042f66a5ed28c9 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sat Aug 19 17:12:52 2023 -0500 Update README.md commit 7d1196108ad330b32845546fb3472c2172a0b6b8 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Mon Aug 14 23:03:12 2023 -0500 remove force DMMV commit cd61aa0d9e16627935c7978adf488a679ddfa745 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sat Aug 12 17:24:31 2023 -0500 restore main_gpu parameter commit 4a042f326830271a4c31104051b7b08e08ac234e Author: Henri Vasserman <henv@hot.ee> Date: Sat Aug 12 10:51:46 2023 +0300 gfx1100 support --------- Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com> Co-authored-by: jammm <2500920+jammm@users.noreply.github.com> Co-authored-by: jdecourval <7315817+jdecourval@users.noreply.github.com> commit 8913bc6fea97d3cb860937b0461f455c6abe3ea1 Author: Henri Vasserman <henv@hot.ee> Date: Fri Aug 11 10:16:02 2023 +0300 Allow overriding CC_TURING commit e77a4c37a756c002e97173f4122e088fb304e18a Author: Henri Vasserman <henv@hot.ee> Date: Fri Aug 11 10:00:07 2023 +0300 Merge 'origin/master' into hipblas commit cc4c4e355cd553b1557d5fba2562e824db93f9b4 Author: Engininja2 <139037756+Engininja2@users.noreply.github.com> Date: Fri Aug 11 09:43:14 2023 +0300 New __dp4a assembly Now compatible with gfx900 and faster as well. commit 1a03b709848ce68d5bf5966237756167e2cac540 Author: Henri Vasserman <henv@hot.ee> Date: Fri Aug 11 09:30:28 2023 +0300 Undo mess --------- Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com> commit 4366ff9ba1b1f12e494118ef9b5198479022fcc5 Author: DannyDaemonic <DannyDaemonic@gmail.com> Date: Thu Aug 10 13:11:36 2023 -0700 Handle `ENABLE_VIRTUAL_TERMINAL_PROCESSING` more gracefully on earlier versions of Windows. commit 811ff855a24323cafddc95c1b8aca711fef05f76 Author: Christian Demsar <crasm@git.vczf.us> Date: Thu Aug 10 10:28:27 2023 -0400 Add --n-predict -2 for stopping generation on full context (#2565) commit 37c9717aaa6815b6a5be21aaab970212f20fe6bf Author: Martin Krasser <krasserm@googlemail.com> Date: Thu Aug 10 12:16:38 2023 +0200 Fix grammar-based sampling issue in server (#2566) commit d18ecd5b9e5dde58ae08a3eef1637406159ddaca Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Thu Aug 10 13:19:41 2023 -0500 make mmq gen faster for amd commit 243894a952147a4fac5b6aee748861a0df6cc2c6 Author: Henri Vasserman <henv@hot.ee> Date: Thu Aug 10 12:14:40 2023 +0300 ws fix commit ac2f14da445ea87d73539adbd29d19ff2c9eba58 Author: Engininja2 <139037756+Engininja2@users.noreply.github.com> Date: Thu Aug 10 12:11:27 2023 +0300 AMD assembly optimized __dp4a Doesn't seem to work for gfx900, so commented out. commit 9dba0c985f140ddded8cbb671f139e81fff82eed Author: Henri Vasserman <henv@hot.ee> Date: Thu Aug 10 12:09:28 2023 +0300 Fix merge --------- Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com> Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> commit f570b5cb1070591527a82d94bba408927b37778d Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed Aug 9 22:11:20 2023 -0500 Revert "revert cuda changes as they are bugggy" This reverts commit 1541bf879772aeeed8ff646bfc52185c2a88b79b. commit 1541bf879772aeeed8ff646bfc52185c2a88b79b Author: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Wed Aug 9 22:36:41 2023 +0800 revert cuda changes as they are bugggy commit bacc20203efb1839aa313858a04d75255bb4b7f4 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed Aug 9 20:37:17 2023 -0500 Merge remote-tracking branch 'upstream/concedo' commit b7cb4cfd109986bd66e8fd382d1e2516eaddfebb Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed Aug 9 20:00:52 2023 -0500 additional fixes commit fadae727baa3735ad3e0667384d6e05ca056b3ef Merge: 518eb2a 8f8ab6c Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed Aug 9 18:45:50 2023 -0500 Merge branch 'hipblas' into develop4Main commit 518eb2af9225f8300a108c4244c7eb0a2217c3bc Merge: bda0215cae6a84
Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed Aug 9 18:32:10 2023 -0500 Merge remote-tracking branch 'upstream/concedo' into develop2Main commit bda0215b413bafc49890aa23fc35f96a191fb3e0 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed Aug 9 18:17:54 2023 -0500 update makefile to multisystem path commit 8f8ab6c4c049df501e9a5ed8fef3aa0fc0691421 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed Aug 9 18:05:03 2023 -0500 hipLDFLAG Path change Unix to multisystem in Makefile changed the hardcoded linux distro hipblas LD path from -L/opt/rocm/lib to use the defined ROCM_PATH variable to be flexible with ROCm on non-Linux OS commit 610ba4cfc460ed65c4adc32d3365a216690384d5 Merge: 4024f9125d43e0
Author: Henri Vasserman <henv@hot.ee> Date: Wed Aug 9 23:54:58 2023 +0300 Merge 'origin/master' into hipblas commit 4024f91a665d83b6de8658d45ec9d004c5d90c79 Author: Henri Vasserman <henv@hot.ee> Date: Wed Aug 9 01:56:44 2023 +0300 Add intrinsics polyfills for AMD --------- Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com> Co-authored-by: funnbot <22226942+funnbot@users.noreply.github.com> Co-authored-by: Engininja2 <139037756+Engininja2@users.noreply.github.com> commit ab6212864ce8e9af200bcedb3e0126ee49aa8d0a Merge: d91456af5bfea0
Author: Henri Vasserman <henv@hot.ee> Date: Wed Aug 9 00:37:01 2023 +0300 Merge 'origin/master' into hipblas commit ee9fa2aca4f2e6645b99702935b34a5f8ec8f05d Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed Aug 2 01:53:58 2023 -0500 Update Makefile commit d91456aaf138566fa0aa3d507964049c8a09499b Author: ardfork <134447697+ardfork@users.noreply.github.com> Date: Mon Jul 31 20:35:00 2023 +0300 fix half2 decomposition commit c1cb70d64d307d3fd9b7b9f61bb574e36520499a Author: Henri Vasserman <henv@hot.ee> Date: Mon Jul 31 19:56:44 2023 +0300 new build arg LLAMA_CUDA_MMQ_Y commit c1664a00ae98059df863a88cbcb13eeca3025742 Merge: 43362310728c5a
Author: Henri Vasserman <henv@hot.ee> Date: Mon Jul 31 19:32:27 2023 +0300 Merge 'origin/master' into hipblas commit 848558d7d95a5036ac057efdefa9b2a2e6fb61b7 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sun Jul 30 20:02:52 2023 -0500 import vars logic fix commit b650b849d52aac65364558521f76e75ded7ea590 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sun Jul 30 00:21:36 2023 -0500 Update easy_KCPP-ROCm_install.sh commit 8573a67a29e813d82e7f032912a8c221cd199505 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sat Jul 29 21:31:12 2023 -0500 remove duplicate code and fix typo remove duplicate tooltip commit 430986e3f68f599fd7a11ea4b2b8e45ef33da643 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sat Jul 29 21:07:34 2023 -0500 hide "missing" if all are built move tooltip functions to helper functions section. hides the string "Missing: ..." from showing if all backends are available " if len(runopts)==6 else + " commit dd0db7265dbc0b0699ca861291006808b662b0e4 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sat Jul 29 20:52:31 2023 -0500 hide "missing" if all are built move tooltip functions to helper functions section. hides the string "Missing: ..." from showing if all backends are available commit 43fffb66d8a30cbd776c3682f8a104c3644206b1 Merge: 0ed65a4b40550c
Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sat Jul 29 19:13:15 2023 -0500 Merge branch 'concedo' commit 0ed65a44a5fdb529611730f276a4b910cbf70ae0 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sat Jul 29 18:34:21 2023 -0500 Hide unavailable backends & Add tooltip over backend count Hides unavailable backends from the user and if the program is launched without any backends made, it shows an error message to them stating no backends were found and to make them using the 'make' command Add tooltip when hovering over backend count label hovering over the new label that shows the backend count will explain what the numbers are, and show the users which backends are not available or built commit 2a263983ab35024a95c411995963182ada06ed6f Merge: cee2e9d31486eb
Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sat Jul 29 15:16:33 2023 -0500 Merge remote-tracking branch 'upstream/concedo' commit 4336231a32a0c6168da5d79801752289622e9e58 Author: Henri Vasserman <henv@hot.ee> Date: Sat Jul 29 18:35:56 2023 +0300 add hipBLAS to README --------- Co-authored-by: ardfork <134447697+ardfork@users.noreply.github.com> commit f8e3fc6c746b37d69656fb5ae6af8e411d85dbca Author: Henri Vasserman <henv@hot.ee> Date: Sat Jul 29 14:16:46 2023 +0300 rocblas init stuff commit d2ade639f4339e786311effb3eafca8bfc360d56 Merge: cde52d68a88e58
Author: Henri Vasserman <henv@hot.ee> Date: Sat Jul 29 12:59:48 2023 +0300 Merge 'origin/master' into hipblas commit cee2e9d76740fd8e8f50b612078f3e7658460f29 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed Jul 26 23:36:55 2023 -0500 Only Show Available Backends in GUI Hides unavailable backends from the user and if the program is launched without any backends made, it shows an error message to them stating no backends were found and to make them using the 'make' command commit 78636109fc2ded79ee3e9a44d2e3c2d63a8de70e Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed Jul 26 13:27:22 2023 -0500 Update easy_KCPP-ROCm_install.sh commit 731cd6e2ab9bb722e211142bb633e7018ccdb31b Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Tue Jul 25 22:39:50 2023 -0500 Create easy_rocm_install.sh commit f154685bbdc79b5ace752fbc179e32f2f7806bdb Merge: cbdc1f394e0a06
Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Tue Jul 25 22:25:10 2023 -0500 Merge branch 'concedo_experimentalMAIN' commit cbdc1f3fb91969e79bc8640e0cebfc3247e200df Merge: 5b838d49731682
Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Mon Jul 24 16:53:21 2023 -0500 Merge remote-tracking branch 'upstream/concedo' commit cde52d6a63f13f46d6403cc2957f4b4c34ddf4e2 Merge: 8e8054a84e09a7
Author: Henri Vasserman <henv@hot.ee> Date: Mon Jul 24 12:22:58 2023 +0300 Merge 'origin/master' into hipblas commit 8e8054ad83e794b261914ad4f337d43e2c76882d Author: Henri Vasserman <henv@hot.ee> Date: Mon Jul 24 12:20:49 2023 +0300 Add rocblas to build files commit 1f6294dc4473701b5be791d47e4b3733f95dbc0a Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Mon Jul 24 03:52:01 2023 -0500 Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5) * initialize rocblas commit 5b838d47874536ebffc2f6cb25877e0476a9402d Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Mon Jul 24 03:10:35 2023 -0500 amd multigpu full layer offload w/o vram scratch commit 9bfb2fdd68000670bda85c4e9748d72f5af09764 Merge: b379f9d66328fc
Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Mon Jul 24 03:07:44 2023 -0500 Merge branch 'concedo_experimental' commit b379f9d6fac570c220c928ff5f4ba4ed1ca7c051 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Mon Jul 24 03:07:00 2023 -0500 Revert "amd multigpu full layer offload w/o vram scratch" This reverts commit 9adfc8e33f7116d6ae2e0992920733f783b70d08. commit 9adfc8e33f7116d6ae2e0992920733f783b70d08 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Mon Jul 24 02:56:40 2023 -0500 amd multigpu full layer offload w/o vram scratch commit 05c792e622a1d9838f9343e04f79ddf2bb63ae96 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Mon Jul 24 00:18:48 2023 -0500 initialize rocblas commit ade68d09d7b63d3344e18b6193043b378671eb12 Merge: 521ad6b56995ca
Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sun Jul 23 20:25:05 2023 -0500 Merge remote-tracking branch 'upstream/concedo' commit 521ad6b5cb2a107ad7b972025aeb0f353e0cac67 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Thu Jul 20 21:42:33 2023 -0500 lazy import_var error handling for saves commit 9553e52e7e4eabe46312729f6c4effeef6390df7 Merge: cac6650f036109
Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Thu Jul 20 19:59:41 2023 -0500 Merge remote-tracking branch 'upstream/concedo' commit cac6650754502208abfead61ba169fefc5ae84ac Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Mon Jul 17 23:05:02 2023 -0500 Makefile fix! Allows hip/clblast build together commit 3db70b5f0a1a4a1207041ddc5f2c5e25306bad4d Merge: 2ec44667568d1a
Author: Henri Vasserman <henv@hot.ee> Date: Tue Jul 18 01:54:17 2023 +0300 Merge 'origin/master' into hipblas commit f208670ffb6cdbb1e225adfb2fd80a67a6dc5055 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Fri Jul 14 02:56:03 2023 -0500 improve error handling with gpu names commit 860e73845f61fe0afb6a26cc8054d8be1f9e3669 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Fri Jul 14 00:33:03 2023 -0500 Show GPU names in GUI, Only show GPUs that exist changed the pre-set 1,2,3 and 1,2,3,all settings that the GPU selector had and replaced them with a function that grabs the GPU names and sets the names as the values for the selector boxes. commit 2ec4466db54fd2f42f2ab7713cc1061e0cf59bf3 Author: Henri Vasserman <henv@hot.ee> Date: Thu Jul 13 13:44:02 2023 +0300 Update build flags. GGML_CUDA_DMMV_Y is now GGML_CUDA_MMV_Y so update your build instructions. GGML_CUDA_FORCE_DMMV is always enabled. --------- Co-authored-by: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> commit cd36b185ff6de91abbfd1b80366dd79a1303a878 Merge: afcb8fe1cbf561
Author: Henri Vasserman <henv@hot.ee> Date: Thu Jul 13 13:03:01 2023 +0300 Merge 'origin/master' into hipblas commit ac7ebc3ac1deedfbc2940443b26774f1b4c85fae Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed Jul 12 18:32:18 2023 -0500 add hipBLAS name scheme to GUI and update README commit 7f85cc5ac30f2f300ca817a489ef209c995c634b Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed Jul 12 17:35:54 2023 -0500 update makefile and ggml.c commit 6ca3499275ba168320424f06ab3301ec329a6a83 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed Jul 12 15:43:45 2023 -0500 ggml.c fix commit 770e674aa5b2a1a9ffff2888a12e27b04ccfc7ef Merge: 2b289cd5941514
Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed Jul 12 15:24:36 2023 -0500 Merge remote-tracking branch 'upstream/concedo' commit 2b289cde558310c6c67dfc8d508c04e634595716 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed Jul 12 14:30:00 2023 -0500 Update c-cpp.yml commit 5dae95a9bb486c7f720789dffde1cfb470bffce0 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed Jul 12 14:28:51 2023 -0500 Update c-cpp.yml commit b37cd738c84debb53b149f5a9fb73de958f263fd Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed Jul 12 14:27:04 2023 -0500 Create c-cpp.yml to test Actions commit afcb8fe0c4f5e918422ea41d08824653d58575ed Author: Henri Vasserman <henv@hot.ee> Date: Tue Jul 11 18:09:27 2023 +0300 Add new config option commit 8c2c4978a32d671253809d8f0f09d98af2dd18ab Merge: e6104662347463
Author: Henri Vasserman <henv@hot.ee> Date: Tue Jul 11 17:53:54 2023 +0300 Merge 'origin/master' into hipblas commit e610466307abc8f8bae641682ab3f91dbc33930e Author: Henri Vasserman <henv@hot.ee> Date: Tue Jul 11 17:53:14 2023 +0300 Expand arch list and make it overrideable commit 80e4e548bfbace2a966a58cb57dd1720ad7216b2 Merge: 7735c5a1d16309
Author: Henri Vasserman <henv@hot.ee> Date: Mon Jul 10 02:09:28 2023 +0300 Merge 'origin/master' into hipblas commit 8432e9d5dc8d080535243467f8d380271e8d9489 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sun Jul 9 16:55:30 2023 -0500 Update Makefile commit b58c1893fa839c0f35df96f6a8b026a7f2576762 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sun Jul 9 16:20:00 2023 -0500 Add multi-gpu CuBLAS support to new GUI commit 0c1c71b9927127b45030fe88283dfbdd23853d34 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sat Jul 8 07:56:57 2023 -0500 Update Makefile commit f864f60cd8e563e2594cee5a7da7e9aebed494f9 Author: Johannes Gäßler <johannesg@5d6.de> Date: Sat Jul 8 00:25:15 2023 +0200 CUDA: add __restrict__ to mul mat vec kernels (#2140) commit 4539bc2761a7a23b588b5420b9d3fd1962ff63e5 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sat Jul 8 01:36:14 2023 -0500 update makefile for changes commit 912e31ec523eac9ef308f0d28bc2d93aab7c3ecb Merge: 74e2703ddaa4f2
Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Fri Jul 7 23:15:37 2023 -0500 Merge remote-tracking branch 'upstream/concedo' commit 74e2703ac3b1557f107e540657d0919db115f913 Merge: cf65429f9108ba
Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed Jul 5 15:16:49 2023 -0500 Merge branch 'LostRuins:concedo' into main commit 7735c5a9af58f6713b54fd5a4b6463f3b116d44d Merge: c3e37337ee76e4
Author: Henri Vasserman <henv@hot.ee> Date: Tue Jul 4 17:09:16 2023 +0300 Merge 'origin/master' into hipblas commit cf65429c3832d32a8c17c7ed5ab47066d7511fbe Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Mon Jul 3 16:56:40 2023 -0500 print cuda or opencl based on what's used commit 72c16d2310b2e4c44018e2084aeb79e68c0b8709 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Mon Jul 3 16:45:39 2023 -0500 Revert "fix my mistake that broke other arches" This reverts commit 777aed5e69e240a54e7d3da962d8520855f072b9. commit 777aed5e69e240a54e7d3da962d8520855f072b9 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Mon Jul 3 15:53:32 2023 -0500 fix my mistake that broke other arches commit 27780a987a8dabb18689038c0397e16f2f219c7e Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sun Jul 2 16:03:27 2023 -0500 rocm fixes commit f52c7d439770c1ea0bebc1f895b74d6aeea5f0a6 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sun Jul 2 16:02:58 2023 -0500 Revert "rocm fixes" This reverts commit 2fe9927353a1e53353623f850d3d534da88f5154. commit 2fe9927353a1e53353623f850d3d534da88f5154 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sun Jul 2 15:58:21 2023 -0500 rocm fixes commit efe7560c83a497f5e750bbe27922babd4233bda9 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sun Jul 2 15:55:43 2023 -0500 Revert "move HIPBLAS definitions into ggml-cuda.h" This reverts commit bf49a93d63f833b7871ba6e60f8fe207562678ee. commit 4fc0181e44685019dcd309d4bb345cac7a5fef87 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sun Jul 2 15:55:36 2023 -0500 Revert "move hipblas definitions to header files" This reverts commit 2741ffb70464a71fd138484de4b41da05622e027. commit 89eb576f2771bd81a3a6274348b47535dfdd5f63 Merge: 2741ffb3d2907d
Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sun Jul 2 14:44:13 2023 -0500 Merge branch 'LostRuins:concedo' into main commit c3e3733c61f7705ea00fd593ee94527da8c12f1b Author: Henri Vasserman <henv@hot.ee> Date: Sun Jul 2 15:51:31 2023 +0300 ROCm fixes commit 15db19ae7b70d2a6350063e633b898a89ad78cbc Merge: 04419f146088f7
Author: Henri Vasserman <henv@hot.ee> Date: Sun Jul 2 15:39:57 2023 +0300 Merge 'origin/master' into hipblas commit 2741ffb70464a71fd138484de4b41da05622e027 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sat Jul 1 17:07:42 2023 -0500 move hipblas definitions to header files commit bf49a93d63f833b7871ba6e60f8fe207562678ee Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sat Jul 1 16:38:50 2023 -0500 move HIPBLAS definitions into ggml-cuda.h commit 540f4e05f4e95378f46a83e2919d3962c0ef9eac Merge: 2c3b46feda663f
Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sat Jul 1 14:58:32 2023 -0500 Merge remote-tracking branch 'upstream/concedo' commit 2c3b46f8a80ca9d94b2d3d06e1af6b6f7b791914 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Thu Jun 29 18:43:43 2023 -0500 changes to fix build commit c9e1103da0d72fd39a36391ac4b5d941a133598a Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Thu Jun 29 18:20:07 2023 -0500 Update ggml_v2-cuda-legacy.cu for ROCM commit b858fc5db80ed545a6fbeae3d551bddb47955598 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Thu Jun 29 17:49:39 2023 -0500 changes to work with upstream commit 69a0c2534bb8825f4009760b12d9bd44d108c6ed Merge: 096f0b01347d3a
Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Thu Jun 29 16:59:06 2023 -0500 Merge remote-tracking branch 'upstream/concedo' commit 04419f18947e7b0dc43c07869eac3965f22b34cf Merge: bb16effd3494bb
Author: Henri Vasserman <henv@hot.ee> Date: Wed Jun 28 23:30:10 2023 +0300 Merge 'origin/master' into hipblas commit bb16effc750e2706050f5d4ec89cecc42cc13882 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed Jun 28 15:27:10 2023 -0500 headers fix; add kquants_iter for hipblas and add gfx803 (#1) * kquants_iter for hipblas and add gfx803 * Update CMakeLists.txt with hipblas kquants_iter and DMMV_F16 * remove dmmv_f16 for now commit 096f0b055e11b7d930842f86146d0e5013c5dce6 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed Jun 28 15:27:02 2023 -0500 revert unnecessary hipblas conditionals commit d81e81adffd6eb59e280ae1885864bb5fbd9bba6 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed Jun 28 14:48:23 2023 -0500 Update Makefile hipblas nvcc correction commit c8ae94524a8bd7dca891b6b711cb5598a30fcf74 Merge: c1e5c830be54f7
Author: Henri Vasserman <henv@hot.ee> Date: Tue Jun 27 10:50:37 2023 +0300 Merge 'origin/master' into hipblas commit 2579ecf8db9569d7756161f05ce7b0f5f23174b0 Merge: abed427d2034ce
Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sun Jun 25 17:50:04 2023 -0500 Merge branch 'LostRuins:concedo' into main commit c1e5c8345eca45563d382d9417b84ed5f0ab77ff Merge: 35a6031447ccbe
Author: Henri Vasserman <henv@hot.ee> Date: Sun Jun 25 21:40:05 2023 +0300 Merge 'origin/master' into hipblas commit 35a603161a17ddeb6128e9d4718b8fab5e34b558 Merge: df7346c66a2555
Author: Henri Vasserman <henv@hot.ee> Date: Sun Jun 25 10:57:48 2023 +0300 Merge 'origin/master' into hipblas commit abed427b6f370698fe8e8409e7980f238aad03ef Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sat Jun 24 19:16:30 2023 -0500 reorganize If statements to include proper headers commit 06c3bf03b92c2e00fc4bcd27f0c34f32c58b19a9 Merge: ea6d3208342fe8
Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sat Jun 24 16:57:20 2023 -0500 Merge branch 'LostRuins:concedo' into main commit ea6d3208dcdc0b05e2c164dde8ee0bfc6a02ad09 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Fri Jun 23 01:53:28 2023 -0500 Update README.md commit 4d56ad8158595d1e835cb379939dc5526deb39e2 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Thu Jun 22 16:19:43 2023 -0500 Update README.md commit 21f930872b6e232679fe02eac9e429367365c6af Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Thu Jun 22 15:42:05 2023 -0500 kquants_iter for hipblas and add gfx803 commit df7346ccd52bc0368eeeb878e31a284e01eac61a Merge: 5dd2fbe7487137
Author: Henri Vasserman <henv@hot.ee> Date: Thu Jun 22 20:51:09 2023 +0300 Merge 'origin/master' into hipblas commit b6ff89066bbf2de23dab90bc8bbf9f63d8d1e070 Merge: eb094f0e6ddb15
Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Thu Jun 22 12:42:09 2023 -0500 Merge branch 'LostRuins:concedo' into main commit eb094f043f9b0b94e7db028ca36e96ce479b0369 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed Jun 21 23:59:18 2023 -0500 lowvram parameter description commit 3a5dfeb568d543376910180caa9a99b081fef9d4 Merge: 665cc11b1f00fa
Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed Jun 21 16:53:03 2023 -0500 Merge branch 'LostRuins:concedo' into koboldcpp-rocm commit 665cc1136b188e7ff5c1aa1359118c999ff6d162 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Wed Jun 21 01:13:19 2023 -0500 add lowvram parameter commit 222cbbb141f7ce79884cafb6bcebd860ae27cc04 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Tue Jun 20 19:03:28 2023 -0500 add additional hipblas conditions for cublas commit e1f958124ec99525cb58d8c534f9d1789377544e Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Tue Jun 20 16:51:59 2023 -0500 Add hip def for cuda v2 commit 3bff5c0f0defd9d49b770c5ce107c71e5cba8003 Merge: a7e74b3266d47a
Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Tue Jun 20 13:38:06 2023 -0500 Merge branch 'LostRuins:concedo' into koboldcpp-rocm commit a7e74b39fe5eedf85d955fe5ea5f4c546322a9b0 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Mon Jun 19 22:04:18 2023 -0500 Update README.md commit 5e99b3cb72d83f45b3f7904ffb8f242e743a142c Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Mon Jun 19 22:03:42 2023 -0500 Update Makefile commit 9190b17432ebdc489ab05b71df6c3b8d5e7f5895 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Mon Jun 19 21:47:10 2023 -0500 Update README.md commit 5dd2fbe6ea87f78e38d888844a3820302a297048 Merge: 67e229b20568fe
Author: Henri Vasserman <henv@hot.ee> Date: Tue Jun 20 01:23:12 2023 +0300 Merge 'origin/master' into hipblas commit 2780ea292b1e9c6ead274de3afb34337716be08f Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sun Jun 18 15:48:00 2023 -0500 Update Makefile commit 04a3e64807a92c2e105af92f16dd6db2ea024d39 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sun Jun 18 14:33:39 2023 -0500 remove extra line commit cccbca9dea3780e797a3b4972ba211e0c762fdc1 Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sun Jun 18 14:31:17 2023 -0500 attempt adding ROCM hipblas commit a44a1d4b90ed11d83d622eb976a945ff26a8974e Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sun Jun 18 14:31:01 2023 -0500 attempt adding ROCM hipblas commit b08818416972f83349bc4d6479bccc55ee31436d Author: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sun Jun 18 14:30:54 2023 -0500 attempt adding ROCM hipblas commit 67e229b7ca0a51f367c1e1495a15c261d0893d25 Merge: 6f7c156b241649
Author: Henri Vasserman <henv@hot.ee> Date: Sun Jun 18 00:36:54 2023 +0300 Merge 'origin/master' into hipblas commit 6f7c15637a8ed60d5d5dade24aaf63a296bc32a6 Merge: 61df8e9fc45a81
Author: Henri Vasserman <henv@hot.ee> Date: Sat Jun 17 16:53:22 2023 +0300 Merge 'origin/master' into hipblas commit 61df8e92179b84af9041e53f61d0194dfd791de0 Author: Henri Vasserman <henv@hot.ee> Date: Wed Jun 14 22:46:10 2023 +0300 add cudaMemset commit a836529996845343dfb96becb4fd48e3f55da55c Merge: 85f902d254a7a7
Author: Henri Vasserman <henv@hot.ee> Date: Wed Jun 14 22:41:55 2023 +0300 Merge 'origin/master' into hipblas commit 85f902d5c44cee18858812212dad850b9409c7f9 Merge: 4362e80b50b570
Author: Henri Vasserman <henv@hot.ee> Date: Thu Jun 8 10:50:28 2023 +0300 Merge 'origin/master' into hipblas commit 4362e805a4b0bd80d0cff0e3d8d0b1162cc8043c Merge: fa5b3d717366df
Author: Henri Vasserman <henv@hot.ee> Date: Tue Jun 6 23:14:40 2023 +0300 Merge 'origin/master' into hipblas commit fa5b3d7365266a9903450c1105551ffec7f51d92 Author: Henri Vasserman <henv@hot.ee> Date: Tue Jun 6 18:47:00 2023 +0300 fix makefile. commit 1ba4ce4ad792f9672eecc37bf982386d3a007914 Author: Henri Vasserman <henv@hot.ee> Date: Tue Jun 6 18:41:08 2023 +0300 Revert "warp size fixes" It seems like 32 is faster for me, at least and it won't cause so many conflicts. This reverts commit 5d6eb72164e5ae000d07dd725e635faa7a2f723d. commit 5d6eb72164e5ae000d07dd725e635faa7a2f723d Author: Henri Vasserman <henv@hot.ee> Date: Tue Jun 6 18:32:41 2023 +0300 warp size fixes commit 33091a9bd3bb3ecf59b0f5535b084f443f6a20b6 Merge: 9fdaa1d2d43387
Author: Henri Vasserman <henv@hot.ee> Date: Tue Jun 6 16:19:23 2023 +0300 Merge 'origin/master' into hipblas commit 9fdaa1d2501a2c4a030af6d34e97b2e4766b27c4 Author: Henri Vasserman <henv@hot.ee> Date: Sat May 27 19:17:53 2023 +0300 Add more defs For forward compatibility #1607 commit a4648c1e7c70b4985393ec0851403ef7fb8d1ffc Merge: 4c8b3fb0ecb1bb
Author: Henri Vasserman <henv@hot.ee> Date: Sat May 27 18:22:39 2023 +0300 Merge 'origin/master' into hipblas commit 4c8b3fb1071dff0cd0c4b4f96e506294ba6473f4 Author: Henri Vasserman <henv@hot.ee> Date: Fri May 26 01:08:53 2023 +0300 add configurable vars commit 30d921af3e0b21f511652c98448ccb631434d0d4 Author: Henri Vasserman <henv@hot.ee> Date: Fri May 26 01:03:56 2023 +0300 and makefile commit a593a4f6c24389528a5eed8e6dc86eb06ced38b8 Author: Henri Vasserman <henv@hot.ee> Date: Fri May 26 00:55:28 2023 +0300 Add missing parameters commit 174bf6a86d045a30b1253cbe3cc773808b202186 Merge: f80ce7a1fcdcc2
Author: Henri Vasserman <henv@hot.ee> Date: Fri May 26 00:44:23 2023 +0300 Merge 'origin/master' into hipblas commit f80ce7a4e00b33adf6b13d231689dbf3a33ec475 Merge: 600ace3ac7876a
Author: Henri Vasserman <henv@hot.ee> Date: Thu May 25 00:02:50 2023 +0300 Merge branch 'origin/master' into hipblas commit 600ace39c8f1d311b8f3c49003f5a6448a44b18e Author: Henri Vasserman <henv@hot.ee> Date: Sat May 20 23:42:20 2023 +0300 update warp size commit b19fefef943d974db2eda8a8908e67e1d08e317c Author: Henri Vasserman <henv@hot.ee> Date: Sat May 20 23:28:08 2023 +0300 Forwardcompat commit c66115b833178ea3711543ddbbd4eb2b21ab523e Merge:a0b2d5f
b8ee340
Author: Henri Vasserman <henv@hot.ee> Date: Sat May 20 18:29:31 2023 +0300 Merge 'origin/master' into hipblas commita0b2d5f291
Merge:8bab456
2a5ee02
Author: Henri Vasserman <henv@hot.ee> Date: Tue May 16 17:08:29 2023 +0300 Merge 'origin/master' into hipblas commit8bab45611e
Merge:2956630
b5c9295
Author: Henri Vasserman <henv@hot.ee> Date: Mon May 15 00:01:12 2023 +0300 Merge 'origin/master' into hipblas commit2956630a3d
Merge:0fe6384
f048af0
Author: Henri Vasserman <henv@hot.ee> Date: Sat May 13 13:12:52 2023 +0300 Merge 'origin/master' into hipblas commit0fe6384755
Author: Henri Vasserman <henv@hot.ee> Date: Fri May 12 17:22:11 2023 +0300 fix makefile commit605560d9ec
Merge:127f68e
089b1c9
Author: Henri Vasserman <henv@hot.ee> Date: Fri May 12 16:12:53 2023 +0300 Merge 'origin/master' into hipblas commit127f68eb5a
Merge:070cbcc
b608b55
Author: Henri Vasserman <henv@hot.ee> Date: Thu May 11 20:21:27 2023 +0300 Merge 'origin/master' into hipblas commit070cbcc1bd
Author: Henri Vasserman <henv@hot.ee> Date: Sun May 7 18:10:56 2023 +0300 occupanct function commita3296d50aa
Merge:0aefa6a
e129551
Author: Henri Vasserman <henv@hot.ee> Date: Sun May 7 18:06:04 2023 +0300 Merge 'origin/master' into hipblas commit0aefa6ab71
Merge:baeb482
1b0fd45
Author: Henri Vasserman <henv@hot.ee> Date: Sun May 7 12:24:41 2023 +0300 Merge 'origin/master' into hipblas commitbaeb482a94
Author: Henri Vasserman <henv@hot.ee> Date: Sun May 7 12:24:12 2023 +0300 Revert to default copy commit289073a532
Merge:1107194
173d0e6
Author: Henri Vasserman <henv@hot.ee> Date: Sat May 6 19:59:41 2023 +0300 Merge 'origin/master' into hipblas commit1107194e6b
Merge:04c0d48
a3b85b2
Author: Henri Vasserman <henv@hot.ee> Date: Sat May 6 00:38:20 2023 +0300 Merge 'origin/master' into hipblas commit04c0d480d7
Author: Henri Vasserman <henv@hot.ee> Date: Thu May 4 12:31:16 2023 +0300 Move all HIP stuff to ggml-cuda.cu commitd83cfbad0c
Merge:b67cc50
799fdc1
Author: Henri Vasserman <henv@hot.ee> Date: Thu May 4 11:31:16 2023 +0300 Merge 'origin/master' into hipblas commitb67cc50dad
Merge:fcbc262
e216aa0
Author: Henri Vasserman <henv@hot.ee> Date: Wed May 3 15:04:51 2023 +0300 Merge 'origin/master' into hipblas commitfcbc262eb9
Merge:c73def1
f4cef87
Author: Henri Vasserman <henv@hot.ee> Date: Mon May 1 22:45:29 2023 +0300 Merge 'origin/master' into hipblas commitc73def129a
Merge:d8ea75e
f0d70f1
Author: Henri Vasserman <henv@hot.ee> Date: Sun Apr 30 18:40:42 2023 +0300 Merge 'origin/master' into hipblas commitd8ea75e952
Merge:d194586
334637e
Author: Henri Vasserman <henv@hot.ee> Date: Sat Apr 29 11:25:51 2023 +0300 Merge 'origin/master' into hipblas commitd194586f65
Merge:2ab9d11
7f15c5c
Author: Henri Vasserman <henv@hot.ee> Date: Fri Apr 28 23:03:52 2023 +0300 Merge 'origin/master' into hipblas commit2ab9d11f37
Merge:3b4a531
04aaae1
Author: Henri Vasserman <henv@hot.ee> Date: Fri Apr 28 16:30:05 2023 +0300 Merge 'origin/master' into hipblas commit3b4a53138f
Merge:a1caa48
0b2da20
Author: Henri Vasserman <henv@hot.ee> Date: Fri Apr 28 10:08:41 2023 +0300 Merge 'origin/master' into hipblas commita1caa48611
Author: Henri Vasserman <henv@hot.ee> Date: Fri Apr 28 10:08:21 2023 +0300 add more cuda defines This is so 'slaren/cuda-f16f32' would merge. commitecc056519f
Author: Henri Vasserman <henv@hot.ee> Date: Fri Apr 28 01:58:27 2023 +0300 only .cu file needs to be complied as device commitef51e9ecac
Merge:d571d16
4afcc37
Author: Henri Vasserman <henv@hot.ee> Date: Wed Apr 26 12:46:26 2023 +0300 Merge branch 'ggerganov:master' into hipblas commitd571d1629f
Merge:608aa33
dd0eabc
Author: Henri Vasserman <henv@hot.ee> Date: Tue Apr 25 21:15:33 2023 +0300 Merge 'origin/master' into hipblas commit608aa33d9f
Author: Henri Vasserman <henv@hot.ee> Date: Tue Apr 25 21:15:04 2023 +0300 change default GPU arch to match CMake commit3a004b2a01
Author: Henri Vasserman <henv@hot.ee> Date: Mon Apr 24 02:24:54 2023 +0300 add rpath commitdb7a01297e
Merge:3677235
284685f
Author: Henri Vasserman <henv@hot.ee> Date: Sun Apr 23 21:49:28 2023 +0300 Merge 'origin/master' into hipblas commit367723544c
Author: Henri Vasserman <henv@hot.ee> Date: Sat Apr 22 23:28:00 2023 +0300 More build file changes commitd3e1984ce0
Author: Henri Vasserman <henv@hot.ee> Date: Fri Apr 21 03:32:06 2023 +0300 add rpath commit0e005f7793
Author: Henri Vasserman <henv@hot.ee> Date: Fri Apr 21 02:13:00 2023 +0300 Build file changes Now HIP Clang is not required, the CMake scripts will configure the needed compiler, which can be system clang++. Also other code can still use GCC, but CMake will force the clang to link. commit54a63c10e8
Author: Henri Vasserman <henv@hot.ee> Date: Thu Apr 20 22:19:22 2023 +0300 Update Makefile for the Cuda kernels commit0fd8363adc
Author: Henri Vasserman <henv@hot.ee> Date: Thu Apr 20 02:04:00 2023 +0300 use hipblas based on cublas * Merge Fixes * readme merge fix * remove old ggmlv2 changes * bring ggml v2_cuda up to date with AMD changes * Revert ggml v2_cuda changes BC they werent needed This reverts commit3385dd4240
. * avoid launching subprocesses to get device names for now, but other than that seems to be working --------- Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
6298 lines
222 KiB
C++
6298 lines
222 KiB
C++
// Defines fileno on msys:
|
|
#ifndef _GNU_SOURCE
|
|
#define _GNU_SOURCE
|
|
#endif
|
|
|
|
#include "llama.h"
|
|
|
|
#include "ggml.h"
|
|
|
|
#include "ggml-alloc.h"
|
|
|
|
#ifdef GGML_USE_CUBLAS
|
|
#include "ggml-cuda.h"
|
|
#endif
|
|
#if defined(GGML_USE_CLBLAST)
|
|
#include "ggml-opencl.h"
|
|
#endif
|
|
|
|
#ifdef GGML_USE_METAL
|
|
# include "ggml-metal.h"
|
|
#endif
|
|
#ifdef GGML_USE_MPI
|
|
# include "ggml-mpi.h"
|
|
#endif
|
|
#ifdef GGML_USE_K_QUANTS
|
|
# ifndef QK_K
|
|
# ifdef GGML_QKK_64
|
|
# define QK_K 64
|
|
# else
|
|
# define QK_K 256
|
|
# endif
|
|
# endif
|
|
#endif
|
|
|
|
#ifdef __has_include
|
|
#if __has_include(<unistd.h>)
|
|
#include <unistd.h>
|
|
#if defined(_POSIX_MAPPED_FILES)
|
|
#include <sys/mman.h>
|
|
#endif
|
|
#if defined(_POSIX_MEMLOCK_RANGE)
|
|
#include <sys/resource.h>
|
|
#endif
|
|
#endif
|
|
#endif
|
|
|
|
#if defined(_WIN32)
|
|
#define WIN32_LEAN_AND_MEAN
|
|
#ifndef NOMINMAX
|
|
#define NOMINMAX
|
|
#endif
|
|
#include <windows.h>
|
|
#include <io.h>
|
|
#include <stdio.h> // for _fseeki64
|
|
#endif
|
|
|
|
#include <algorithm>
|
|
#include <array>
|
|
#include <cassert>
|
|
#include <cinttypes>
|
|
#include <climits>
|
|
#include <cstdarg>
|
|
#include <cstddef>
|
|
#include <cstdint>
|
|
#include <cstdio>
|
|
#include <cstring>
|
|
#include <ctime>
|
|
#include <fstream>
|
|
#include <initializer_list>
|
|
#include <map>
|
|
#include <memory>
|
|
#include <mutex>
|
|
#include <numeric>
|
|
#include <queue>
|
|
#include <random>
|
|
#include <regex>
|
|
#include <sstream>
|
|
#include <thread>
|
|
#include <unordered_map>
|
|
|
|
#if defined(_MSC_VER)
|
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
|
#endif
|
|
|
|
#ifdef __GNUC__
|
|
#ifdef __MINGW32__
|
|
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
|
#else
|
|
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
|
#endif
|
|
#else
|
|
#define LLAMA_ATTRIBUTE_FORMAT(...)
|
|
#endif
|
|
|
|
//
|
|
// logging
|
|
//
|
|
|
|
LLAMA_ATTRIBUTE_FORMAT(2, 3)
|
|
static void llama_log_internal (llama_log_level level, const char* format, ...);
|
|
static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data);
|
|
|
|
#define LLAMA_LOG_INFO(...) llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__)
|
|
#define LLAMA_LOG_WARN(...) llama_log_internal(LLAMA_LOG_LEVEL_WARN , __VA_ARGS__)
|
|
#define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
|
|
|
|
//
|
|
// helpers
|
|
//
|
|
|
|
static size_t utf8_len(char src) {
|
|
const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
|
|
uint8_t highbits = static_cast<uint8_t>(src) >> 4;
|
|
return lookup[highbits];
|
|
}
|
|
|
|
void replace_all(std::string & s, const std::string & search, const std::string & replace) {
|
|
std::string result;
|
|
for (size_t pos = 0; ; pos += search.length()) {
|
|
auto new_pos = s.find(search, pos);
|
|
if (new_pos == std::string::npos) {
|
|
result += s.substr(pos, s.size() - pos);
|
|
break;
|
|
}
|
|
result += s.substr(pos, new_pos - pos) + replace;
|
|
pos = new_pos;
|
|
}
|
|
s = std::move(result);
|
|
}
|
|
|
|
static void zeros(std::ofstream & file, size_t n) {
|
|
char zero = 0;
|
|
for (size_t i = 0; i < n; ++i) {
|
|
file.write(&zero, 1);
|
|
}
|
|
}
|
|
|
|
LLAMA_ATTRIBUTE_FORMAT(1, 2)
|
|
static std::string format(const char * fmt, ...) {
|
|
va_list ap;
|
|
va_list ap2;
|
|
va_start(ap, fmt);
|
|
va_copy(ap2, ap);
|
|
int size = vsnprintf(NULL, 0, fmt, ap);
|
|
GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
|
|
std::vector<char> buf(size + 1);
|
|
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
|
|
GGML_ASSERT(size2 == size);
|
|
va_end(ap2);
|
|
va_end(ap);
|
|
return std::string(buf.data(), size);
|
|
}
|
|
|
|
//
|
|
// gguf constants (sync with gguf.py)
|
|
//
|
|
|
|
enum llm_arch {
|
|
LLM_ARCH_LLAMA,
|
|
LLM_ARCH_FALCON,
|
|
LLM_ARCH_GPT2,
|
|
LLM_ARCH_GPTJ,
|
|
LLM_ARCH_GPTNEOX,
|
|
LLM_ARCH_MPT,
|
|
LLM_ARCH_UNKNOWN,
|
|
};
|
|
|
|
static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
|
|
{ LLM_ARCH_LLAMA, "llama" },
|
|
{ LLM_ARCH_FALCON, "falcon" },
|
|
{ LLM_ARCH_GPT2, "gpt2" },
|
|
{ LLM_ARCH_GPTJ, "gptj" },
|
|
{ LLM_ARCH_GPTNEOX, "gptneox" },
|
|
{ LLM_ARCH_MPT, "mpt" },
|
|
};
|
|
|
|
enum llm_kv {
|
|
LLM_KV_GENERAL_ARCHITECTURE,
|
|
LLM_KV_GENERAL_QUANTIZATION_VERSION,
|
|
LLM_KV_GENERAL_ALIGNMENT,
|
|
LLM_KV_GENERAL_NAME,
|
|
LLM_KV_GENERAL_AUTHOR,
|
|
LLM_KV_GENERAL_URL,
|
|
LLM_KV_GENERAL_DESCRIPTION,
|
|
LLM_KV_GENERAL_LICENSE,
|
|
LLM_KV_GENERAL_SOURCE_URL,
|
|
LLM_KV_GENERAL_SOURCE_HF_REPO,
|
|
|
|
LLM_KV_CONTEXT_LENGTH,
|
|
LLM_KV_EMBEDDING_LENGTH,
|
|
LLM_KV_BLOCK_COUNT,
|
|
LLM_KV_FEED_FORWARD_LENGTH,
|
|
LLM_KV_USE_PARALLEL_RESIDUAL,
|
|
LLM_KV_TENSOR_DATA_LAYOUT,
|
|
|
|
LLM_KV_ATTENTION_HEAD_COUNT,
|
|
LLM_KV_ATTENTION_HEAD_COUNT_KV,
|
|
LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
|
|
LLM_KV_ATTENTION_CLAMP_KQV,
|
|
LLM_KV_ATTENTION_LAYERNORM_EPS,
|
|
LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
|
|
|
|
LLM_KV_ROPE_DIMENSION_COUNT,
|
|
LLM_KV_ROPE_FREQ_BASE,
|
|
LLM_KV_ROPE_SCALE_LINEAR,
|
|
|
|
LLM_KV_TOKENIZER_MODEL,
|
|
LLM_KV_TOKENIZER_LIST,
|
|
LLM_KV_TOKENIZER_TOKEN_TYPE,
|
|
LLM_KV_TOKENIZER_SCORES,
|
|
LLM_KV_TOKENIZER_MERGES,
|
|
LLM_KV_TOKENIZER_BOS_ID,
|
|
LLM_KV_TOKENIZER_EOS_ID,
|
|
LLM_KV_TOKENIZER_UNK_ID,
|
|
LLM_KV_TOKENIZER_SEP_ID,
|
|
LLM_KV_TOKENIZER_PAD_ID,
|
|
LLM_KV_TOKENIZER_HF_JSON,
|
|
LLM_KV_TOKENIZER_RWKV,
|
|
};
|
|
|
|
static std::map<llm_kv, std::string> LLM_KV_NAMES = {
|
|
{ LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
|
|
{ LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
|
|
{ LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
|
|
{ LLM_KV_GENERAL_NAME, "general.name" },
|
|
{ LLM_KV_GENERAL_AUTHOR, "general.author" },
|
|
{ LLM_KV_GENERAL_URL, "general.url" },
|
|
{ LLM_KV_GENERAL_DESCRIPTION, "general.description" },
|
|
{ LLM_KV_GENERAL_LICENSE, "general.license" },
|
|
{ LLM_KV_GENERAL_SOURCE_URL, "general.source_url" },
|
|
{ LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source_hf_repo" },
|
|
|
|
{ LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
|
|
{ LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
|
|
{ LLM_KV_BLOCK_COUNT, "%s.block_count" },
|
|
{ LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
|
|
{ LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
|
|
{ LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
|
|
|
|
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
|
|
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
|
|
{ LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
|
|
{ LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
|
|
{ LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
|
|
{ LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
|
|
|
|
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
|
|
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
|
|
{ LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
|
|
|
|
{ LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
|
|
{ LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
|
|
{ LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
|
|
{ LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
|
|
{ LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
|
|
{ LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
|
|
{ LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" },
|
|
{ LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
|
|
{ LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
|
|
{ LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
|
|
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
|
|
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
|
|
};
|
|
|
|
struct LLM_KV {
|
|
LLM_KV(llm_arch arch) : arch(arch) {}
|
|
|
|
llm_arch arch;
|
|
|
|
std::string operator()(llm_kv kv) const {
|
|
return ::format(LLM_KV_NAMES[kv].c_str(), LLM_ARCH_NAMES[arch].c_str());
|
|
}
|
|
};
|
|
|
|
enum llm_tensor {
|
|
LLM_TENSOR_TOKEN_EMBD,
|
|
LLM_TENSOR_POS_EMBD,
|
|
LLM_TENSOR_OUTPUT,
|
|
LLM_TENSOR_OUTPUT_NORM,
|
|
LLM_TENSOR_ROPE_FREQS,
|
|
LLM_TENSOR_ATTN_Q,
|
|
LLM_TENSOR_ATTN_K,
|
|
LLM_TENSOR_ATTN_V,
|
|
LLM_TENSOR_ATTN_QKV,
|
|
LLM_TENSOR_ATTN_OUT,
|
|
LLM_TENSOR_ATTN_NORM,
|
|
LLM_TENSOR_ATTN_NORM_2,
|
|
LLM_TENSOR_ATTN_ROT_EMBD,
|
|
LLM_TENSOR_FFN_GATE,
|
|
LLM_TENSOR_FFN_DOWN,
|
|
LLM_TENSOR_FFN_UP,
|
|
LLM_TENSOR_FFN_NORM,
|
|
};
|
|
|
|
static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
|
|
{
|
|
LLM_ARCH_LLAMA,
|
|
{
|
|
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
|
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
|
{ LLM_TENSOR_OUTPUT, "output" },
|
|
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
|
|
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
|
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
|
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
|
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
|
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
|
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
|
|
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
|
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
|
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
|
},
|
|
},
|
|
{
|
|
LLM_ARCH_FALCON,
|
|
{
|
|
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
|
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
|
{ LLM_TENSOR_OUTPUT, "output" },
|
|
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
|
{ LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
|
|
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
|
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
|
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
|
},
|
|
},
|
|
};
|
|
|
|
static llm_arch llm_arch_from_string(const std::string & name) {
|
|
for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT
|
|
if (kv.second == name) {
|
|
return kv.first;
|
|
}
|
|
}
|
|
|
|
return LLM_ARCH_UNKNOWN;
|
|
}
|
|
|
|
// helper to handle gguf constants
|
|
// usage:
|
|
//
|
|
// const auto tn = LLM_TN(LLM_ARCH_LLAMA);
|
|
//
|
|
// std::string name = tn(LLM_TENSOR_OUTPUT); -> "output"
|
|
// std::string name = tn(LLM_TENSOR_TOKEN_EMBD, "bias"); -> "token_embd.bias"
|
|
// std::string name = tn(LLM_TENSOR_ATTN_NORM, "weight", 3); -> "blk.3.attn_norm.weight"
|
|
//
|
|
struct LLM_TN {
|
|
LLM_TN(llm_arch arch) : arch(arch) {}
|
|
|
|
llm_arch arch;
|
|
|
|
std::string operator()(llm_tensor tensor) const {
|
|
return LLM_TENSOR_NAMES[arch].at(tensor);
|
|
}
|
|
|
|
std::string operator()(llm_tensor tensor, const std::string & suffix) const {
|
|
return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
|
|
}
|
|
|
|
std::string operator()(llm_tensor tensor, int bid) const {
|
|
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
|
|
}
|
|
|
|
std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
|
|
return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
|
|
}
|
|
};
|
|
|
|
//
|
|
// gguf helpers
|
|
//
|
|
|
|
#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
|
|
{ \
|
|
const std::string skey(key); \
|
|
const int kid = gguf_find_key(ctx, skey.c_str()); \
|
|
if (kid >= 0) { \
|
|
enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
|
|
if (ktype != (type)) { \
|
|
throw std::runtime_error(format("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype))); \
|
|
} \
|
|
(dst) = func(ctx, kid); \
|
|
} else if (req) { \
|
|
throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
|
|
} \
|
|
}
|
|
|
|
//
|
|
// ggml helpers
|
|
//
|
|
|
|
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
|
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
|
|
|
if (plan.work_size > 0) {
|
|
buf.resize(plan.work_size);
|
|
plan.work_data = buf.data();
|
|
}
|
|
|
|
ggml_graph_compute(graph, &plan);
|
|
}
|
|
|
|
//
|
|
// llama helpers
|
|
//
|
|
|
|
#ifdef GGML_USE_CUBLAS
|
|
# define llama_host_malloc(n) ggml_cuda_host_malloc(n)
|
|
# define llama_host_free(data) ggml_cuda_host_free(data)
|
|
#elif GGML_USE_METAL
|
|
# define llama_host_malloc(n) ggml_metal_host_malloc(n)
|
|
# define llama_host_free(data) ggml_metal_host_free(data)
|
|
#else
|
|
# define llama_host_malloc(n) malloc(n)
|
|
# define llama_host_free(data) free(data)
|
|
#endif
|
|
|
|
#if defined(_WIN32)
|
|
static std::string llama_format_win_err(DWORD err) {
|
|
LPSTR buf;
|
|
size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
|
|
NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
|
|
if (!size) {
|
|
return "FormatMessageA failed";
|
|
}
|
|
std::string ret(buf, size);
|
|
LocalFree(buf);
|
|
return ret;
|
|
}
|
|
#endif
|
|
|
|
struct llama_buffer {
|
|
void * data = NULL;
|
|
size_t size = 0;
|
|
|
|
// fallback to malloc / free
|
|
// useful in cases where CUDA can try to allocate PINNED memory
|
|
bool fallback = false;
|
|
|
|
void resize(size_t n) {
|
|
llama_host_free(data);
|
|
|
|
data = llama_host_malloc(n);
|
|
if (!data) {
|
|
fallback = true;
|
|
data = malloc(n);
|
|
} else {
|
|
fallback = false;
|
|
}
|
|
|
|
GGML_ASSERT(data);
|
|
size = n;
|
|
}
|
|
|
|
~llama_buffer() {
|
|
if (data) {
|
|
if (fallback) { // NOLINT
|
|
free(data);
|
|
} else {
|
|
llama_host_free(data);
|
|
}
|
|
}
|
|
|
|
data = NULL;
|
|
}
|
|
};
|
|
|
|
struct llama_file {
|
|
// use FILE * so we don't have to re-open the file to mmap
|
|
FILE * fp;
|
|
size_t size;
|
|
|
|
llama_file(const char * fname, const char * mode) {
|
|
fp = std::fopen(fname, mode);
|
|
if (fp == NULL) {
|
|
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
|
|
}
|
|
seek(0, SEEK_END);
|
|
size = tell();
|
|
seek(0, SEEK_SET);
|
|
}
|
|
|
|
size_t tell() const {
|
|
#ifdef _WIN32
|
|
__int64 ret = _ftelli64(fp);
|
|
#else
|
|
long ret = std::ftell(fp);
|
|
#endif
|
|
GGML_ASSERT(ret != -1); // this really shouldn't fail
|
|
return (size_t) ret;
|
|
}
|
|
|
|
void seek(size_t offset, int whence) const {
|
|
#ifdef _WIN32
|
|
int ret = _fseeki64(fp, (__int64) offset, whence);
|
|
#else
|
|
int ret = std::fseek(fp, (long) offset, whence);
|
|
#endif
|
|
GGML_ASSERT(ret == 0); // same
|
|
}
|
|
|
|
void read_raw(void * ptr, size_t len) const {
|
|
if (len == 0) {
|
|
return;
|
|
}
|
|
errno = 0;
|
|
std::size_t ret = std::fread(ptr, len, 1, fp);
|
|
if (ferror(fp)) {
|
|
throw std::runtime_error(format("read error: %s", strerror(errno)));
|
|
}
|
|
if (ret != 1) {
|
|
throw std::runtime_error(std::string("unexpectedly reached end of file"));
|
|
}
|
|
}
|
|
|
|
uint32_t read_u32() const {
|
|
uint32_t ret;
|
|
read_raw(&ret, sizeof(ret));
|
|
return ret;
|
|
}
|
|
|
|
void write_raw(const void * ptr, size_t len) const {
|
|
if (len == 0) {
|
|
return;
|
|
}
|
|
errno = 0;
|
|
size_t ret = std::fwrite(ptr, len, 1, fp);
|
|
if (ret != 1) {
|
|
throw std::runtime_error(format("write error: %s", strerror(errno)));
|
|
}
|
|
}
|
|
|
|
void write_u32(std::uint32_t val) const {
|
|
write_raw(&val, sizeof(val));
|
|
}
|
|
|
|
~llama_file() {
|
|
if (fp) {
|
|
std::fclose(fp);
|
|
}
|
|
}
|
|
};
|
|
|
|
struct llama_mmap {
|
|
void * addr;
|
|
size_t size;
|
|
|
|
llama_mmap(const llama_mmap &) = delete;
|
|
|
|
#ifdef _POSIX_MAPPED_FILES
|
|
static constexpr bool SUPPORTED = true;
|
|
|
|
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
|
|
size = file->size;
|
|
int fd = fileno(file->fp);
|
|
int flags = MAP_SHARED;
|
|
// prefetch/readahead impairs performance on NUMA systems
|
|
if (numa) { prefetch = 0; }
|
|
#ifdef __linux__
|
|
if (prefetch) { flags |= MAP_POPULATE; }
|
|
#endif
|
|
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
|
|
if (addr == MAP_FAILED) {
|
|
throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
|
|
}
|
|
|
|
if (prefetch > 0) {
|
|
// Advise the kernel to preload the mapped memory
|
|
if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
|
|
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
|
|
strerror(errno));
|
|
}
|
|
}
|
|
if (numa) {
|
|
// advise the kernel not to use readahead
|
|
// (because the next page might not belong on the same node)
|
|
if (madvise(addr, file->size, MADV_RANDOM)) {
|
|
fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
|
|
strerror(errno));
|
|
}
|
|
}
|
|
}
|
|
|
|
~llama_mmap() {
|
|
munmap(addr, size);
|
|
}
|
|
#elif defined(_WIN32)
|
|
static constexpr bool SUPPORTED = true;
|
|
|
|
llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
|
|
(void) numa;
|
|
|
|
size = file->size;
|
|
|
|
HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
|
|
|
|
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
|
|
DWORD error = GetLastError();
|
|
|
|
if (hMapping == NULL) {
|
|
throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
|
|
}
|
|
|
|
addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
|
|
error = GetLastError();
|
|
CloseHandle(hMapping);
|
|
|
|
if (addr == NULL) {
|
|
throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
|
|
}
|
|
|
|
#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
|
|
if (prefetch) {
|
|
// Advise the kernel to preload the mapped memory
|
|
WIN32_MEMORY_RANGE_ENTRY range;
|
|
range.VirtualAddress = addr;
|
|
range.NumberOfBytes = (SIZE_T)size;
|
|
if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
|
|
fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
|
|
llama_format_win_err(GetLastError()).c_str());
|
|
}
|
|
}
|
|
#else
|
|
#pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
|
|
#endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
|
|
}
|
|
|
|
~llama_mmap() {
|
|
if (!UnmapViewOfFile(addr)) {
|
|
fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
|
|
llama_format_win_err(GetLastError()).c_str());
|
|
}
|
|
}
|
|
#else
|
|
static constexpr bool SUPPORTED = false;
|
|
|
|
llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
|
|
(void) file;
|
|
(void) prefetch;
|
|
(void) numa;
|
|
|
|
throw std::runtime_error(std::string("mmap not supported"));
|
|
}
|
|
#endif
|
|
};
|
|
|
|
// Represents some region of memory being locked using mlock or VirtualLock;
|
|
// will automatically unlock on destruction.
|
|
struct llama_mlock {
|
|
void * addr = NULL;
|
|
size_t size = 0;
|
|
|
|
bool failed_already = false;
|
|
|
|
llama_mlock() {}
|
|
llama_mlock(const llama_mlock &) = delete;
|
|
|
|
~llama_mlock() {
|
|
if (size) {
|
|
raw_unlock(addr, size);
|
|
}
|
|
}
|
|
|
|
void init(void * ptr) {
|
|
GGML_ASSERT(addr == NULL && size == 0); // NOLINT
|
|
addr = ptr;
|
|
}
|
|
|
|
void grow_to(size_t target_size) {
|
|
GGML_ASSERT(addr);
|
|
if (failed_already) {
|
|
return;
|
|
}
|
|
size_t granularity = lock_granularity();
|
|
target_size = (target_size + granularity - 1) & ~(granularity - 1);
|
|
if (target_size > size) {
|
|
if (raw_lock((uint8_t *) addr + size, target_size - size)) {
|
|
size = target_size;
|
|
} else {
|
|
failed_already = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
#ifdef _POSIX_MEMLOCK_RANGE
|
|
static constexpr bool SUPPORTED = true;
|
|
|
|
static size_t lock_granularity() {
|
|
return (size_t) sysconf(_SC_PAGESIZE);
|
|
}
|
|
|
|
#ifdef __APPLE__
|
|
#define MLOCK_SUGGESTION \
|
|
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
|
|
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
|
|
#else
|
|
#define MLOCK_SUGGESTION \
|
|
"Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
|
|
#endif
|
|
|
|
bool raw_lock(const void * addr, size_t size) const {
|
|
if (!mlock(addr, size)) {
|
|
return true;
|
|
}
|
|
|
|
char* errmsg = std::strerror(errno);
|
|
bool suggest = (errno == ENOMEM);
|
|
|
|
// Check if the resource limit is fine after all
|
|
struct rlimit lock_limit;
|
|
if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
|
|
suggest = false;
|
|
}
|
|
if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
|
|
suggest = false;
|
|
}
|
|
|
|
fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
|
|
size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
|
|
return false;
|
|
}
|
|
|
|
#undef MLOCK_SUGGESTION
|
|
|
|
static void raw_unlock(void * addr, size_t size) {
|
|
if (munlock(addr, size)) {
|
|
fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
|
|
}
|
|
}
|
|
#elif defined(_WIN32)
|
|
static constexpr bool SUPPORTED = true;
|
|
|
|
static size_t lock_granularity() {
|
|
SYSTEM_INFO si;
|
|
GetSystemInfo(&si);
|
|
return (size_t) si.dwPageSize;
|
|
}
|
|
|
|
bool raw_lock(void * ptr, size_t len) const {
|
|
for (int tries = 1; ; tries++) {
|
|
if (VirtualLock(ptr, len)) {
|
|
return true;
|
|
}
|
|
if (tries == 2) {
|
|
fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
|
|
len, size, llama_format_win_err(GetLastError()).c_str());
|
|
return false;
|
|
}
|
|
|
|
// It failed but this was only the first try; increase the working
|
|
// set size and try again.
|
|
SIZE_T min_ws_size, max_ws_size;
|
|
if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
|
|
fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
|
|
llama_format_win_err(GetLastError()).c_str());
|
|
return false;
|
|
}
|
|
// Per MSDN: "The maximum number of pages that a process can lock
|
|
// is equal to the number of pages in its minimum working set minus
|
|
// a small overhead."
|
|
// Hopefully a megabyte is enough overhead:
|
|
size_t increment = len + 1048576;
|
|
// The minimum must be <= the maximum, so we need to increase both:
|
|
min_ws_size += increment;
|
|
max_ws_size += increment;
|
|
if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
|
|
fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
|
|
llama_format_win_err(GetLastError()).c_str());
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
|
|
static void raw_unlock(void * ptr, size_t len) {
|
|
if (!VirtualUnlock(ptr, len)) {
|
|
fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
|
|
llama_format_win_err(GetLastError()).c_str());
|
|
}
|
|
}
|
|
#else
|
|
static constexpr bool SUPPORTED = false;
|
|
|
|
static size_t lock_granularity() {
|
|
return (size_t) 65536;
|
|
}
|
|
|
|
bool raw_lock(const void * addr, size_t len) const {
|
|
fprintf(stderr, "warning: mlock not supported on this system\n");
|
|
return false;
|
|
}
|
|
|
|
static void raw_unlock(const void * addr, size_t len) {}
|
|
#endif
|
|
};
|
|
|
|
typedef void (*offload_func_t)(struct ggml_tensor * tensor);
|
|
|
|
static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
|
|
(void) tensor;
|
|
}
|
|
|
|
static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
|
|
std::vector<char> result(8, 0);
|
|
const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
|
|
if (n_tokens < 0) {
|
|
result.resize(-n_tokens);
|
|
int check = llama_token_to_piece(ctx, token, result.data(), result.size());
|
|
GGML_ASSERT(check == -n_tokens);
|
|
} else {
|
|
result.resize(n_tokens);
|
|
}
|
|
|
|
return std::string(result.data(), result.size());
|
|
}
|
|
|
|
//
|
|
// globals
|
|
//
|
|
|
|
struct llama_state {
|
|
// We save the log callback globally
|
|
llama_log_callback log_callback = llama_log_callback_default;
|
|
void * log_callback_user_data = nullptr;
|
|
};
|
|
|
|
static llama_state g_state;
|
|
|
|
// available llama models
|
|
enum e_model {
|
|
MODEL_UNKNOWN,
|
|
MODEL_3B,
|
|
MODEL_7B,
|
|
MODEL_13B,
|
|
MODEL_30B,
|
|
MODEL_34B,
|
|
MODEL_40B,
|
|
MODEL_65B,
|
|
MODEL_70B,
|
|
};
|
|
|
|
static const size_t kB = 1024;
|
|
static const size_t MB = kB*kB;
|
|
|
|
// default hparams (LLaMA 7B)
|
|
struct llama_hparams {
|
|
uint32_t n_vocab = 32000;
|
|
uint32_t n_ctx_train = 2048; // the context size used during training
|
|
uint32_t n_ctx = 512; // the context size used during inference
|
|
uint32_t n_embd = 4096;
|
|
uint32_t n_head = 32;
|
|
uint32_t n_head_kv = 32;
|
|
uint32_t n_layer = 32;
|
|
uint32_t n_rot = 64;
|
|
uint32_t n_ff = 11008;
|
|
|
|
float f_norm_eps = 1e-5;
|
|
float f_norm_rms_eps = 1e-5;
|
|
|
|
float rope_freq_base = 10000.0f;
|
|
float rope_freq_scale = 1.0f;
|
|
|
|
bool operator!=(const llama_hparams & other) const {
|
|
return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
|
|
}
|
|
|
|
uint32_t n_gqa() const {
|
|
return n_head/n_head_kv;
|
|
}
|
|
|
|
uint32_t n_embd_head() const {
|
|
return n_embd/n_head;
|
|
}
|
|
|
|
uint32_t n_embd_gqa() const {
|
|
return n_embd/n_gqa();
|
|
}
|
|
|
|
size_t kv_size() const {
|
|
size_t result = 2ull;
|
|
result *= (size_t) n_embd_gqa();
|
|
result *= (size_t) n_ctx;
|
|
result *= (size_t) n_layer;
|
|
result *= sizeof(ggml_fp16_t);
|
|
return result;
|
|
}
|
|
};
|
|
|
|
struct llama_layer {
|
|
// normalization
|
|
struct ggml_tensor * attn_norm;
|
|
struct ggml_tensor * attn_norm_b;
|
|
struct ggml_tensor * attn_norm_2;
|
|
struct ggml_tensor * attn_norm_2_b;
|
|
|
|
// attention
|
|
struct ggml_tensor * wq;
|
|
struct ggml_tensor * wk;
|
|
struct ggml_tensor * wv;
|
|
struct ggml_tensor * wo;
|
|
struct ggml_tensor * wqkv;
|
|
|
|
// normalization
|
|
struct ggml_tensor * ffn_norm;
|
|
|
|
// ff
|
|
struct ggml_tensor * w1; // ffn_gate
|
|
struct ggml_tensor * w2; // ffn_down
|
|
struct ggml_tensor * w3; // ffn_up
|
|
};
|
|
|
|
struct llama_kv_cache {
|
|
struct ggml_tensor * k = NULL;
|
|
struct ggml_tensor * v = NULL;
|
|
|
|
struct ggml_context * ctx = NULL;
|
|
|
|
llama_buffer buf;
|
|
|
|
int n; // number of tokens currently in the cache
|
|
|
|
~llama_kv_cache() {
|
|
if (ctx) {
|
|
ggml_free(ctx);
|
|
}
|
|
|
|
#ifdef GGML_USE_CUBLAS
|
|
ggml_cuda_free_data(k);
|
|
ggml_cuda_free_data(v);
|
|
#endif // GGML_USE_CUBLAS
|
|
}
|
|
};
|
|
|
|
struct llama_vocab {
|
|
using id = int32_t;
|
|
using token = std::string;
|
|
using ttype = llama_token_type;
|
|
|
|
struct token_data {
|
|
token text;
|
|
float score;
|
|
ttype type;
|
|
};
|
|
|
|
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
|
|
|
|
std::unordered_map<token, id> token_to_id;
|
|
std::vector<token_data> id_to_token;
|
|
|
|
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
|
|
|
|
// default LLaMA special tokens
|
|
id special_bos_id = 1;
|
|
id special_eos_id = 2;
|
|
id special_unk_id = 0;
|
|
id special_sep_id = -1;
|
|
id special_pad_id = -1;
|
|
|
|
id linefeed_id = 13;
|
|
|
|
int find_bpe_rank(std::string token_left, std::string token_right) const {
|
|
replace_all(token_left, " ", "\u0120");
|
|
replace_all(token_left, "\n", "\u010A");
|
|
replace_all(token_right, " ", "\u0120");
|
|
replace_all(token_right, "\n", "\u010A");
|
|
|
|
auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
|
|
if (it == bpe_ranks.end()) {
|
|
return -1;
|
|
}
|
|
|
|
return it->second;
|
|
}
|
|
};
|
|
|
|
struct llama_model {
|
|
e_model type = MODEL_UNKNOWN;
|
|
llm_arch arch = LLM_ARCH_UNKNOWN;
|
|
llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
|
|
|
|
std::string name = "n/a";
|
|
|
|
llama_hparams hparams;
|
|
llama_vocab vocab;
|
|
|
|
struct ggml_tensor * tok_embeddings;
|
|
|
|
struct ggml_tensor * output_norm;
|
|
struct ggml_tensor * output_norm_b;
|
|
struct ggml_tensor * output;
|
|
|
|
std::vector<llama_layer> layers;
|
|
|
|
int n_gpu_layers;
|
|
|
|
// context
|
|
struct ggml_context * ctx = NULL;
|
|
|
|
// the model memory buffer
|
|
llama_buffer buf;
|
|
|
|
// model memory mapped file
|
|
std::unique_ptr<llama_mmap> mapping;
|
|
|
|
// objects representing data potentially being locked in memory
|
|
llama_mlock mlock_buf;
|
|
llama_mlock mlock_mmap;
|
|
|
|
// for quantize-stats only
|
|
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
|
|
|
|
int64_t t_load_us = 0;
|
|
int64_t t_start_us = 0;
|
|
|
|
~llama_model() {
|
|
if (ctx) {
|
|
ggml_free(ctx);
|
|
}
|
|
|
|
#ifdef GGML_USE_CUBLAS
|
|
for (size_t i = 0; i < tensors_by_name.size(); ++i) {
|
|
ggml_cuda_free_data(tensors_by_name[i].second);
|
|
}
|
|
ggml_cuda_free_scratch();
|
|
#elif defined(GGML_USE_CLBLAST)
|
|
for (size_t i = 0; i < tensors_by_name.size(); ++i) {
|
|
ggml_cl_free_data(tensors_by_name[i].second);
|
|
}
|
|
#endif
|
|
}
|
|
};
|
|
|
|
struct llama_context {
|
|
llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
|
|
~llama_context() {
|
|
if (model_owner) {
|
|
delete &model;
|
|
}
|
|
#ifdef GGML_USE_METAL
|
|
if (ctx_metal) {
|
|
ggml_metal_free(ctx_metal);
|
|
}
|
|
#endif
|
|
if (alloc) {
|
|
ggml_allocr_free(alloc);
|
|
}
|
|
}
|
|
|
|
std::mt19937 rng;
|
|
|
|
bool has_evaluated_once = false;
|
|
|
|
int64_t t_sample_us = 0;
|
|
int64_t t_eval_us = 0;
|
|
int64_t t_p_eval_us = 0;
|
|
|
|
int32_t n_sample = 0; // number of tokens sampled
|
|
int32_t n_eval = 0; // number of eval calls
|
|
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
|
|
|
|
const llama_model & model;
|
|
|
|
bool model_owner = false;
|
|
|
|
int64_t t_load_us;
|
|
int64_t t_start_us;
|
|
|
|
// key + value cache for the self attention
|
|
struct llama_kv_cache kv_self;
|
|
|
|
// decode output (2-dimensional array: [n_tokens][n_vocab])
|
|
std::vector<float> logits;
|
|
bool logits_all = false;
|
|
|
|
// input embedding (1-dimensional array: [n_embd])
|
|
std::vector<float> embedding;
|
|
|
|
// reusable buffer for `struct ggml_graph_plan.work_data`
|
|
std::vector<uint8_t> work_buffer;
|
|
|
|
// memory buffers used to evaluate the model
|
|
llama_buffer buf_compute;
|
|
|
|
llama_buffer buf_alloc;
|
|
ggml_allocr * alloc = NULL;
|
|
|
|
#ifdef GGML_USE_METAL
|
|
ggml_metal_context * ctx_metal = NULL;
|
|
#endif
|
|
|
|
#ifdef GGML_USE_MPI
|
|
ggml_mpi_context * ctx_mpi = NULL;
|
|
#endif
|
|
};
|
|
|
|
//
|
|
// kv cache helpers
|
|
//
|
|
|
|
static bool llama_kv_cache_init(
|
|
const struct llama_hparams & hparams,
|
|
struct llama_kv_cache & cache,
|
|
ggml_type wtype,
|
|
int n_ctx,
|
|
int n_gpu_layers) {
|
|
const int n_embd = hparams.n_embd_gqa();
|
|
const int n_layer = hparams.n_layer;
|
|
|
|
const int64_t n_mem = n_layer*n_ctx;
|
|
const int64_t n_elements = n_embd*n_mem;
|
|
|
|
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
|
|
cache.n = 0;
|
|
|
|
struct ggml_init_params params;
|
|
params.mem_size = cache.buf.size;
|
|
params.mem_buffer = cache.buf.data;
|
|
params.no_alloc = false;
|
|
|
|
cache.ctx = ggml_init(params);
|
|
|
|
if (!cache.ctx) {
|
|
LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
|
|
return false;
|
|
}
|
|
|
|
cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
|
|
cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
|
|
ggml_set_name(cache.k, "cache_k");
|
|
ggml_set_name(cache.v, "cache_v");
|
|
|
|
(void) n_gpu_layers;
|
|
#ifdef GGML_USE_CUBLAS
|
|
if (n_gpu_layers > n_layer + 1) {
|
|
ggml_cuda_assign_buffers_no_scratch(cache.v);
|
|
}
|
|
if (n_gpu_layers > n_layer + 2) {
|
|
ggml_cuda_assign_buffers_no_scratch(cache.k);
|
|
}
|
|
#endif // GGML_USE_CUBLAS
|
|
|
|
return true;
|
|
}
|
|
|
|
//
|
|
// model loading and saving
|
|
//
|
|
|
|
enum llama_fver {
|
|
GGUF_FILE_VERSION_V1 = 1,
|
|
GGUF_FILE_VERSION_V2 = 2,
|
|
};
|
|
|
|
static const char * llama_file_version_name(llama_fver version) {
|
|
switch (version) {
|
|
case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
|
|
case GGUF_FILE_VERSION_V2: return "GGUF V2 (latest)";
|
|
}
|
|
|
|
return "unknown";
|
|
}
|
|
|
|
static std::string llama_format_tensor_shape(const std::vector<int64_t> & ne) {
|
|
char buf[256];
|
|
snprintf(buf, sizeof(buf), "%5" PRId64, ne.at(0));
|
|
for (size_t i = 1; i < ne.size(); i++) {
|
|
snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, ne.at(i));
|
|
}
|
|
return buf;
|
|
}
|
|
|
|
static std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
|
|
char buf[256];
|
|
snprintf(buf, sizeof(buf), "%5" PRId64, t->ne[0]);
|
|
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
|
snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, t->ne[i]);
|
|
}
|
|
return buf;
|
|
}
|
|
|
|
struct llama_model_loader {
|
|
int n_kv = 0;
|
|
int n_tensors = 0;
|
|
int n_created = 0;
|
|
|
|
int64_t n_elements = 0;
|
|
|
|
bool use_mmap = false;
|
|
|
|
llama_file file;
|
|
llama_ftype ftype;
|
|
llama_fver fver;
|
|
|
|
std::unique_ptr<llama_mmap> mapping;
|
|
|
|
struct gguf_context * ctx_gguf = NULL;
|
|
struct ggml_context * ctx_meta = NULL;
|
|
|
|
llama_model_loader(const std::string & fname, bool use_mmap) : file(fname.c_str(), "rb") {
|
|
struct gguf_init_params params = {
|
|
/*.no_alloc = */ true,
|
|
/*.ctx = */ &ctx_meta,
|
|
};
|
|
|
|
ctx_gguf = gguf_init_from_file(fname.c_str(), params);
|
|
if (!ctx_gguf) {
|
|
throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
|
|
}
|
|
|
|
n_kv = gguf_get_n_kv(ctx_gguf);
|
|
n_tensors = gguf_get_n_tensors(ctx_gguf);
|
|
|
|
fver = (enum llama_fver ) gguf_get_version(ctx_gguf);
|
|
|
|
for (int i = 0; i < n_tensors; i++) {
|
|
const char * name = gguf_get_tensor_name(ctx_gguf, i);
|
|
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
|
|
n_elements += ggml_nelements(t);
|
|
}
|
|
|
|
LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
|
|
__func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
|
|
|
|
// determine file type based on the number of tensors for each quantization and print meta data
|
|
// TODO: make optional
|
|
{
|
|
std::map<enum ggml_type, uint32_t> n_type;
|
|
|
|
uint32_t n_type_max = 0;
|
|
enum ggml_type type_max = GGML_TYPE_F32;
|
|
|
|
for (int i = 0; i < n_tensors; i++) {
|
|
const char * name = gguf_get_tensor_name(ctx_gguf, i);
|
|
struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, name);
|
|
|
|
n_type[meta->type]++;
|
|
|
|
if (n_type_max < n_type[meta->type]) {
|
|
n_type_max = n_type[meta->type];
|
|
type_max = meta->type;
|
|
}
|
|
|
|
LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str());
|
|
}
|
|
|
|
switch (type_max) {
|
|
case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
|
|
case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
|
|
case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
|
|
case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
|
|
case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
|
|
case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break;
|
|
case GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break;
|
|
case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break;
|
|
case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break;
|
|
case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
|
|
case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
|
|
case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
|
|
default:
|
|
{
|
|
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
|
|
ftype = LLAMA_FTYPE_ALL_F32;
|
|
} break;
|
|
}
|
|
|
|
// this is a way to mark that we have "guessed" the file type
|
|
ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
|
|
|
|
{
|
|
const int kid = gguf_find_key(ctx_gguf, "general.file_type");
|
|
if (kid >= 0) {
|
|
ftype = (llama_ftype) gguf_get_val_u32(ctx_gguf, kid);
|
|
}
|
|
}
|
|
|
|
for (int i = 0; i < n_kv; i++) {
|
|
const char * name = gguf_get_key(ctx_gguf, i);
|
|
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
|
|
|
|
LLAMA_LOG_INFO("%s: - kv %3d: %42s %-8s\n", __func__, i, name, gguf_type_name(type));
|
|
}
|
|
|
|
// print type counts
|
|
for (auto & kv : n_type) {
|
|
if (kv.second == 0) {
|
|
continue;
|
|
}
|
|
|
|
LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
|
|
}
|
|
}
|
|
|
|
if (!llama_mmap::SUPPORTED) {
|
|
LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__);
|
|
use_mmap = false;
|
|
}
|
|
|
|
this->use_mmap = use_mmap;
|
|
}
|
|
|
|
~llama_model_loader() {
|
|
if (ctx_gguf) {
|
|
gguf_free(ctx_gguf);
|
|
}
|
|
if (ctx_meta) {
|
|
ggml_free(ctx_meta);
|
|
}
|
|
}
|
|
|
|
std::string get_arch_name() const {
|
|
const auto kv = LLM_KV(LLM_ARCH_UNKNOWN);
|
|
|
|
std::string arch_name;
|
|
GGUF_GET_KEY(ctx_gguf, arch_name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_ARCHITECTURE));
|
|
|
|
return arch_name;
|
|
}
|
|
|
|
enum llm_arch get_arch() const {
|
|
const std::string arch_name = get_arch_name();
|
|
|
|
return llm_arch_from_string(arch_name);
|
|
}
|
|
|
|
const char * get_tensor_name(int i) const {
|
|
return gguf_get_tensor_name(ctx_gguf, i);
|
|
}
|
|
|
|
struct ggml_tensor * get_tensor_meta(int i) const {
|
|
return ggml_get_tensor(ctx_meta, get_tensor_name(i));
|
|
}
|
|
|
|
void calc_sizes(size_t & ctx_size_p, size_t & mmapped_size_p) const {
|
|
ctx_size_p = 0;
|
|
mmapped_size_p = 0;
|
|
|
|
for (int i = 0; i < n_tensors; i++) {
|
|
struct ggml_tensor * meta = get_tensor_meta(i);
|
|
ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
|
|
(use_mmap ? mmapped_size_p : ctx_size_p) += ggml_nbytes_pad(meta);
|
|
}
|
|
}
|
|
|
|
struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend backend) {
|
|
if (backend != GGML_BACKEND_CPU) {
|
|
ggml_set_no_alloc(ctx, true);
|
|
}
|
|
|
|
struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta);
|
|
tensor->backend = backend; // TODO: ggml_set_backend
|
|
ggml_set_name(tensor, ggml_get_name(meta));
|
|
|
|
if (backend != GGML_BACKEND_CPU) {
|
|
ggml_set_no_alloc(ctx, use_mmap);
|
|
}
|
|
|
|
n_created++;
|
|
|
|
return tensor;
|
|
}
|
|
|
|
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend backend) {
|
|
struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
|
|
|
|
if (cur == NULL) {
|
|
throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
|
|
}
|
|
|
|
{
|
|
bool is_ok = true;
|
|
for (size_t i = 0; i < ne.size(); ++i) {
|
|
if (ne[i] != cur->ne[i]) {
|
|
is_ok = false;
|
|
break;
|
|
}
|
|
}
|
|
if (!is_ok) {
|
|
throw std::runtime_error(
|
|
format("%s: tensor '%s' has wrong shape; expected %s, got %s",
|
|
__func__, name.c_str(),
|
|
llama_format_tensor_shape(ne).c_str(),
|
|
llama_format_tensor_shape(cur).c_str()));
|
|
}
|
|
}
|
|
|
|
return create_tensor_for(ctx, cur, backend);
|
|
}
|
|
|
|
void done_getting_tensors() const {
|
|
if (n_created != n_tensors) {
|
|
throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
|
|
}
|
|
}
|
|
|
|
size_t file_offset(const char * name) const {
|
|
const int idx = gguf_find_tensor(ctx_gguf, name);
|
|
|
|
if (idx < 0) {
|
|
throw std::runtime_error(format("%s: tensor '%s' not found in the file", __func__, name));
|
|
}
|
|
|
|
return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
|
|
}
|
|
|
|
void load_data_for(struct ggml_tensor * cur) const {
|
|
const size_t offs = file_offset(ggml_get_name(cur));
|
|
|
|
if (use_mmap) {
|
|
cur->data = (uint8_t *) mapping->addr + offs;
|
|
} else {
|
|
file.seek(offs, SEEK_SET);
|
|
file.read_raw(cur->data, ggml_nbytes(cur));
|
|
}
|
|
}
|
|
|
|
void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
|
|
size_t size_data = 0;
|
|
size_t size_lock = 0;
|
|
size_t size_pref = 0; // prefetch
|
|
|
|
for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
|
|
struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
|
|
size_data += ggml_nbytes(cur);
|
|
if (cur->backend == GGML_BACKEND_CPU) {
|
|
size_pref += ggml_nbytes(cur);
|
|
}
|
|
}
|
|
|
|
if (use_mmap) {
|
|
mapping.reset(new llama_mmap(&file, size_pref, ggml_is_numa()));
|
|
if (lmlock) {
|
|
lmlock->init(mapping->addr);
|
|
}
|
|
}
|
|
|
|
size_t done_size = 0;
|
|
for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
|
|
struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
|
|
GGML_ASSERT(cur); // unused tensors should have been caught by load_data already
|
|
|
|
if (progress_callback) {
|
|
progress_callback((float) done_size / size_data, progress_callback_user_data);
|
|
}
|
|
|
|
// allocate temp buffer if not using mmap
|
|
if (!use_mmap && cur->data == NULL) {
|
|
GGML_ASSERT(cur->backend != GGML_BACKEND_CPU);
|
|
cur->data = malloc(ggml_nbytes(cur));
|
|
}
|
|
|
|
load_data_for(cur);
|
|
|
|
switch (cur->backend) {
|
|
case GGML_BACKEND_CPU:
|
|
if (use_mmap && lmlock) {
|
|
size_lock += ggml_nbytes(cur);
|
|
lmlock->grow_to(size_lock);
|
|
}
|
|
break;
|
|
#if defined(GGML_USE_CUBLAS)
|
|
case GGML_BACKEND_GPU:
|
|
case GGML_BACKEND_GPU_SPLIT:
|
|
// old code:
|
|
//ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
|
|
|
|
// TODO: test if this works !!
|
|
ggml_cuda_transform_tensor(cur->data, cur);
|
|
if (!use_mmap) {
|
|
free(cur->data);
|
|
}
|
|
break;
|
|
#elif defined(GGML_USE_CLBLAST)
|
|
case GGML_BACKEND_GPU:
|
|
ggml_cl_transform_tensor(cur->data, cur);
|
|
if (!use_mmap) {
|
|
free(cur->data);
|
|
}
|
|
break;
|
|
#endif
|
|
default:
|
|
continue;
|
|
}
|
|
|
|
done_size += ggml_nbytes(cur);
|
|
}
|
|
}
|
|
};
|
|
|
|
//
|
|
// load LLaMA models
|
|
//
|
|
|
|
std::string llama_model_ftype_name(enum llama_ftype ftype) {
|
|
if (ftype & LLAMA_FTYPE_GUESSED) {
|
|
return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
|
|
}
|
|
|
|
switch (ftype) {
|
|
case LLAMA_FTYPE_ALL_F32: return "all F32";
|
|
case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
|
|
case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
|
|
case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
|
|
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
|
|
return "mostly Q4_1, some F16";
|
|
case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
|
|
case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
|
|
case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
|
|
|
|
// K-quants
|
|
case LLAMA_FTYPE_MOSTLY_Q2_K: return "mostly Q2_K";
|
|
case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small";
|
|
case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium";
|
|
case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large";
|
|
case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "mostly Q4_K - Small";
|
|
case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium";
|
|
case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small";
|
|
case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium";
|
|
case LLAMA_FTYPE_MOSTLY_Q6_K: return "mostly Q6_K";
|
|
|
|
default: return "unknown, may not work";
|
|
}
|
|
}
|
|
|
|
static const char * llama_model_type_name(e_model type) {
|
|
switch (type) {
|
|
case MODEL_3B: return "3B";
|
|
case MODEL_7B: return "7B";
|
|
case MODEL_13B: return "13B";
|
|
case MODEL_30B: return "30B";
|
|
case MODEL_34B: return "34B";
|
|
case MODEL_40B: return "40B";
|
|
case MODEL_65B: return "65B";
|
|
case MODEL_70B: return "70B";
|
|
default: return "?B";
|
|
}
|
|
}
|
|
|
|
static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
|
|
model.arch = ml.get_arch();
|
|
if (model.arch == LLM_ARCH_UNKNOWN) {
|
|
throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
|
|
}
|
|
}
|
|
|
|
static void llm_load_hparams(
|
|
llama_model_loader & ml,
|
|
llama_model & model,
|
|
int n_ctx,
|
|
float rope_freq_base,
|
|
float rope_freq_scale) {
|
|
struct gguf_context * ctx = ml.ctx_gguf;
|
|
|
|
const auto kv = LLM_KV(model.arch);
|
|
|
|
auto & hparams = model.hparams;
|
|
|
|
// get general kv
|
|
GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));
|
|
|
|
// get hparams kv
|
|
GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, kv(LLM_KV_TOKENIZER_LIST));
|
|
GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH));
|
|
GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
|
|
GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
|
|
GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
|
|
GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
|
|
|
|
// n_head_kv is optional, default to n_head
|
|
hparams.n_head_kv = hparams.n_head;
|
|
GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
|
|
|
|
// TODO: manually setting rope freq base and scale should override this
|
|
// FIXME: partial fix when the param specified is not the default value, but
|
|
// will not work for overriding the model value to the params default
|
|
|
|
llama_context_params defaults = llama_context_default_params();
|
|
|
|
// rope_freq_base
|
|
{
|
|
float ropebase = 10000.0f;
|
|
GGUF_GET_KEY(ctx, ropebase, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
|
|
if (ropebase != 10000.0f && rope_freq_base == defaults.rope_freq_base) {
|
|
rope_freq_base = ropebase;
|
|
}
|
|
}
|
|
|
|
// rope_freq_scale (inverse of the kv) is optional
|
|
{
|
|
float ropescale = 1.0f;
|
|
GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
|
|
if (ropescale != 1.0f && rope_freq_scale == defaults.rope_freq_scale) {
|
|
rope_freq_scale = 1.0f/ropescale;
|
|
}
|
|
}
|
|
|
|
// sanity check for n_rot (optional)
|
|
{
|
|
hparams.n_rot = hparams.n_embd / hparams.n_head;
|
|
|
|
GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
|
|
|
|
if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
|
|
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
|
|
}
|
|
}
|
|
|
|
// arch-specific KVs
|
|
switch (model.arch) {
|
|
case LLM_ARCH_LLAMA:
|
|
{
|
|
GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
|
|
|
|
switch (hparams.n_layer) {
|
|
case 26: model.type = e_model::MODEL_3B; break;
|
|
case 32: model.type = e_model::MODEL_7B; break;
|
|
case 40: model.type = e_model::MODEL_13B; break;
|
|
case 48: model.type = e_model::MODEL_34B; break;
|
|
case 60: model.type = e_model::MODEL_30B; break;
|
|
case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_FALCON:
|
|
{
|
|
GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
|
|
|
|
switch (hparams.n_layer) {
|
|
case 32: model.type = e_model::MODEL_7B; break;
|
|
case 60: model.type = e_model::MODEL_40B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
default: (void)0;
|
|
};
|
|
|
|
model.ftype = ml.ftype;
|
|
|
|
hparams.n_ctx = n_ctx;
|
|
hparams.rope_freq_base = rope_freq_base;
|
|
hparams.rope_freq_scale = rope_freq_scale;
|
|
}
|
|
|
|
// TODO: This should probably be in llama.h
|
|
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos);
|
|
static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);
|
|
|
|
static void llm_load_vocab(
|
|
llama_model_loader & ml,
|
|
llama_model & model) {
|
|
auto & vocab = model.vocab;
|
|
|
|
struct gguf_context * ctx = ml.ctx_gguf;
|
|
|
|
const auto kv = LLM_KV(model.arch);
|
|
|
|
const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
|
|
if (token_idx == -1) {
|
|
throw std::runtime_error("cannot find tokenizer vocab in model file\n");
|
|
}
|
|
|
|
const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
|
|
if (score_idx == -1) {
|
|
throw std::runtime_error("cannot find tokenizer scores in model file\n");
|
|
}
|
|
|
|
const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
|
|
|
|
const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
|
|
if (toktype_idx == -1) {
|
|
throw std::runtime_error("cannot find token type list in GGUF file\n");
|
|
}
|
|
|
|
const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
|
|
|
|
// determine vocab type
|
|
{
|
|
std::string tokenizer_name;
|
|
|
|
GGUF_GET_KEY(ctx, tokenizer_name, gguf_get_val_str, GGUF_TYPE_STRING, true, kv(LLM_KV_TOKENIZER_MODEL));
|
|
|
|
if (tokenizer_name == "llama") {
|
|
vocab.type = LLAMA_VOCAB_TYPE_SPM;
|
|
|
|
// default special tokens
|
|
vocab.special_bos_id = 1;
|
|
vocab.special_eos_id = 2;
|
|
vocab.special_unk_id = 0;
|
|
vocab.special_sep_id = -1;
|
|
vocab.special_pad_id = -1;
|
|
} else if (tokenizer_name == "gpt2") {
|
|
vocab.type = LLAMA_VOCAB_TYPE_BPE;
|
|
|
|
// read bpe merges and populate bpe ranks
|
|
const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
|
|
if (merges_keyidx == -1) {
|
|
throw std::runtime_error("cannot find tokenizer merges in model file\n");
|
|
}
|
|
|
|
const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
|
|
|
|
for (int i = 0; i < n_merges; i++) {
|
|
const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
|
|
|
|
std::string first;
|
|
std::string second;
|
|
|
|
const size_t pos = word.find(' ', 1);
|
|
|
|
if (pos != std::string::npos) {
|
|
first = word.substr(0, pos);
|
|
second = word.substr(pos + 1);
|
|
}
|
|
|
|
vocab.bpe_ranks.emplace(std::make_pair(first, second), i);
|
|
}
|
|
|
|
// default special tokens
|
|
vocab.special_bos_id = 11;
|
|
vocab.special_eos_id = 11;
|
|
vocab.special_unk_id = -1;
|
|
vocab.special_sep_id = -1;
|
|
vocab.special_pad_id = -1;
|
|
} else {
|
|
LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
|
|
LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
|
|
|
|
vocab.type = LLAMA_VOCAB_TYPE_SPM;
|
|
}
|
|
}
|
|
|
|
const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
|
|
|
|
vocab.id_to_token.resize(n_vocab);
|
|
|
|
for (uint32_t i = 0; i < n_vocab; i++) {
|
|
std::string word = gguf_get_arr_str(ctx, token_idx, i);
|
|
|
|
vocab.token_to_id[word] = i;
|
|
|
|
auto & token_data = vocab.id_to_token[i];
|
|
token_data.text = std::move(word);
|
|
token_data.score = scores[i];
|
|
token_data.type = (llama_token_type) toktypes[i];
|
|
}
|
|
|
|
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
|
|
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
|
|
vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
|
|
} else {
|
|
vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0];
|
|
}
|
|
|
|
// special tokens
|
|
GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
|
|
GGUF_GET_KEY(ctx, vocab.special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_EOS_ID));
|
|
GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_UNK_ID));
|
|
GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_SEP_ID));
|
|
GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_PAD_ID));
|
|
}
|
|
|
|
static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|
const auto & hparams = model.hparams;
|
|
const auto & vocab = model.vocab;
|
|
|
|
// hparams
|
|
LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
|
|
LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
|
|
LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
|
|
LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
|
|
LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
|
|
LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
|
|
LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, hparams.n_ctx);
|
|
LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
|
|
LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
|
|
LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
|
|
LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
|
|
LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
|
|
LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
|
|
LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
|
|
LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
|
|
LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
|
|
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
|
|
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
|
|
LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
|
|
LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
|
|
LLAMA_LOG_INFO("%s: model size = %.2f B\n", __func__, ml.n_elements*1e-9);
|
|
|
|
// general kv
|
|
LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
|
|
|
|
// special tokens
|
|
if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
|
|
if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
|
|
if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
|
|
if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
|
|
if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
|
|
if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
|
|
}
|
|
|
|
static void llm_load_tensors(
|
|
llama_model_loader & ml,
|
|
llama_model & model,
|
|
int n_batch,
|
|
int n_gpu_layers,
|
|
int main_gpu,
|
|
const float * tensor_split,
|
|
const bool mul_mat_q,
|
|
bool low_vram,
|
|
ggml_type memory_type,
|
|
bool use_mlock,
|
|
llama_progress_callback progress_callback,
|
|
void * progress_callback_user_data) {
|
|
model.t_start_us = ggml_time_us();
|
|
|
|
auto & ctx = model.ctx;
|
|
auto & hparams = model.hparams;
|
|
|
|
model.n_gpu_layers = n_gpu_layers;
|
|
|
|
size_t ctx_size;
|
|
size_t mmapped_size;
|
|
|
|
ml.calc_sizes(ctx_size, mmapped_size);
|
|
|
|
LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
|
|
|
|
// create the ggml context
|
|
{
|
|
model.buf.resize(ctx_size);
|
|
if (use_mlock) {
|
|
model.mlock_buf.init (model.buf.data);
|
|
model.mlock_buf.grow_to(model.buf.size);
|
|
}
|
|
|
|
struct ggml_init_params params = {
|
|
/*.mem_size =*/ model.buf.size,
|
|
/*.mem_buffer =*/ model.buf.data,
|
|
/*.no_alloc =*/ ml.use_mmap,
|
|
};
|
|
|
|
model.ctx = ggml_init(params);
|
|
if (!model.ctx) {
|
|
throw std::runtime_error(format("ggml_init() failed"));
|
|
}
|
|
}
|
|
|
|
(void) main_gpu;
|
|
(void) mul_mat_q;
|
|
#if defined(GGML_USE_CUBLAS)
|
|
LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
|
|
ggml_cuda_set_main_device(main_gpu);
|
|
ggml_cuda_set_mul_mat_q(mul_mat_q);
|
|
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
|
|
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
|
|
#elif defined(GGML_USE_CLBLAST)
|
|
LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
|
|
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
|
|
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
|
|
#else
|
|
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
|
|
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
|
|
#endif
|
|
|
|
// prepare memory for the weights
|
|
size_t vram_weights = 0;
|
|
{
|
|
const int64_t n_embd = hparams.n_embd;
|
|
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
|
const int64_t n_layer = hparams.n_layer;
|
|
const int64_t n_vocab = hparams.n_vocab;
|
|
|
|
const auto tn = LLM_TN(model.arch);
|
|
|
|
switch (model.arch) {
|
|
case LLM_ARCH_LLAMA:
|
|
{
|
|
model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
|
|
|
// output
|
|
{
|
|
ggml_backend backend_norm;
|
|
ggml_backend backend_output;
|
|
|
|
if (n_gpu_layers > int(n_layer)) {
|
|
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
|
// on Windows however this is detrimental unless everything is on the GPU
|
|
#ifndef _WIN32
|
|
backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
|
|
#else
|
|
backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
|
|
#endif // _WIN32
|
|
|
|
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
|
|
} else {
|
|
backend_norm = GGML_BACKEND_CPU;
|
|
backend_output = GGML_BACKEND_CPU;
|
|
}
|
|
|
|
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
|
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
|
|
|
if (backend_norm == GGML_BACKEND_GPU) {
|
|
vram_weights += ggml_nbytes(model.output_norm);
|
|
}
|
|
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
|
vram_weights += ggml_nbytes(model.output);
|
|
}
|
|
}
|
|
|
|
const uint32_t n_ff = hparams.n_ff;
|
|
|
|
const int i_gpu_start = n_layer - n_gpu_layers;
|
|
|
|
model.layers.resize(n_layer);
|
|
|
|
for (uint32_t i = 0; i < n_layer; ++i) {
|
|
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
|
|
const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
|
|
|
|
auto & layer = model.layers[i];
|
|
|
|
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
|
|
|
|
layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
|
|
layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
|
|
layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
|
|
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
|
|
|
|
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
|
|
|
|
layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
|
|
layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
|
|
layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
|
|
|
if (backend == GGML_BACKEND_GPU) {
|
|
vram_weights +=
|
|
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
|
|
ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
|
|
ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
|
|
}
|
|
}
|
|
} break;
|
|
case LLM_ARCH_FALCON:
|
|
{
|
|
// TODO: CPU-only for now
|
|
|
|
model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
|
|
|
// output
|
|
{
|
|
ggml_backend backend_norm;
|
|
ggml_backend backend_output;
|
|
|
|
if (n_gpu_layers > int(n_layer)) {
|
|
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
|
// on Windows however this is detrimental unless everything is on the GPU
|
|
#ifndef _WIN32
|
|
backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
|
|
#else
|
|
backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
|
|
#endif // _WIN32
|
|
|
|
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
|
|
} else {
|
|
backend_norm = GGML_BACKEND_CPU;
|
|
backend_output = GGML_BACKEND_CPU;
|
|
}
|
|
|
|
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
|
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
|
|
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
|
|
|
if (backend_norm == GGML_BACKEND_GPU) {
|
|
vram_weights += ggml_nbytes(model.output_norm);
|
|
vram_weights += ggml_nbytes(model.output_norm_b);
|
|
}
|
|
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
|
vram_weights += ggml_nbytes(model.output);
|
|
}
|
|
}
|
|
|
|
const uint32_t n_ff = hparams.n_ff;
|
|
|
|
const int i_gpu_start = n_layer - n_gpu_layers;
|
|
|
|
model.layers.resize(n_layer);
|
|
|
|
for (uint32_t i = 0; i < n_layer; ++i) {
|
|
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
|
|
const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
|
|
|
|
auto & layer = model.layers[i];
|
|
|
|
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
|
|
layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
|
|
|
|
if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
|
|
layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend);
|
|
layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend);
|
|
|
|
if (backend == GGML_BACKEND_GPU) {
|
|
vram_weights += ggml_nbytes(layer.attn_norm_2);
|
|
vram_weights += ggml_nbytes(layer.attn_norm_2_b);
|
|
}
|
|
}
|
|
|
|
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
|
|
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
|
|
|
|
layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
|
|
layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
|
|
|
if (backend == GGML_BACKEND_GPU) {
|
|
vram_weights +=
|
|
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
|
|
ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) +
|
|
ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
|
|
}
|
|
}
|
|
} break;
|
|
default:
|
|
throw std::runtime_error("unknown architecture");
|
|
};
|
|
}
|
|
|
|
ml.done_getting_tensors();
|
|
|
|
// print memory requirements
|
|
{
|
|
const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
|
|
|
|
// this is the total memory required to run the inference
|
|
size_t mem_required =
|
|
ctx_size +
|
|
mmapped_size - vram_weights; // weights in VRAM not in memory
|
|
|
|
// this is the memory required by one llama_state
|
|
const size_t mem_required_state = scale*hparams.kv_size();
|
|
|
|
LLAMA_LOG_INFO("%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
|
|
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
|
|
|
|
(void) n_batch;
|
|
|
|
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
|
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
|
|
|
LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
|
|
if (n_gpu_layers > (int) hparams.n_layer) {
|
|
LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
|
|
}
|
|
size_t vram_kv_cache = 0;
|
|
|
|
#ifdef GGML_USE_CUBLAS
|
|
const int max_backend_supported_layers = hparams.n_layer + 3;
|
|
#if defined(GGML_USE_HIPBLAS)
|
|
const int max_offloadable_layers = low_vram ? hparams.n_layer + 3 : hparams.n_layer + 3;
|
|
#else
|
|
const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
|
|
#endif
|
|
if (n_gpu_layers > (int) hparams.n_layer + 1) {
|
|
if (low_vram) {
|
|
LLAMA_LOG_INFO("%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
|
|
} else {
|
|
LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
|
|
vram_kv_cache += hparams.kv_size() / 2;
|
|
}
|
|
}
|
|
if (n_gpu_layers > (int) hparams.n_layer + 2) {
|
|
if (low_vram) {
|
|
LLAMA_LOG_WARN("%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
|
|
} else {
|
|
LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
|
|
vram_kv_cache += hparams.kv_size() / 2;
|
|
}
|
|
}
|
|
#elif defined(GGML_USE_CLBLAST)
|
|
const int max_backend_supported_layers = hparams.n_layer + 1;
|
|
const int max_offloadable_layers = hparams.n_layer + 1;
|
|
#endif // GGML_USE_CUBLAS
|
|
|
|
LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n",
|
|
__func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
|
|
LLAMA_LOG_INFO("%s: VRAM used: %zu MB\n",
|
|
__func__, (vram_weights + vram_kv_cache + MB - 1) / MB); // round up
|
|
#else
|
|
(void) n_gpu_layers;
|
|
#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
|
}
|
|
|
|
// populate `tensors_by_name`
|
|
for (int i = 0; i < ml.n_tensors; ++i) {
|
|
struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i));
|
|
model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
|
|
}
|
|
|
|
(void) tensor_split;
|
|
#if defined(GGML_USE_CUBLAS)
|
|
{
|
|
ggml_cuda_set_tensor_split(tensor_split);
|
|
}
|
|
#endif
|
|
|
|
ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
|
|
|
|
if (progress_callback) {
|
|
progress_callback(1.0f, progress_callback_user_data);
|
|
}
|
|
|
|
model.mapping = std::move(ml.mapping);
|
|
|
|
// loading time will be recalculate after the first eval, so
|
|
// we take page faults deferred by mmap() into consideration
|
|
model.t_load_us = ggml_time_us() - model.t_start_us;
|
|
}
|
|
|
|
static bool llama_model_load(
|
|
const std::string & fname,
|
|
llama_model & model,
|
|
int n_ctx,
|
|
int n_batch,
|
|
int n_gpu_layers,
|
|
int main_gpu,
|
|
const float * tensor_split,
|
|
const bool mul_mat_q,
|
|
float rope_freq_base,
|
|
float rope_freq_scale,
|
|
bool low_vram,
|
|
ggml_type memory_type,
|
|
bool use_mmap,
|
|
bool use_mlock,
|
|
bool vocab_only,
|
|
llama_progress_callback progress_callback,
|
|
void *progress_callback_user_data) {
|
|
try {
|
|
std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
|
|
|
|
llm_load_arch (*ml, model);
|
|
llm_load_hparams(*ml, model, n_ctx, rope_freq_base, rope_freq_scale);
|
|
llm_load_vocab (*ml, model);
|
|
|
|
llm_load_print_meta(*ml, model);
|
|
|
|
if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
|
|
throw std::runtime_error("vocab size mismatch");
|
|
}
|
|
|
|
if (vocab_only) {
|
|
LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
|
|
return true;
|
|
}
|
|
|
|
llm_load_tensors(
|
|
*ml, model, n_batch, n_gpu_layers,
|
|
main_gpu, tensor_split, mul_mat_q, low_vram, memory_type,
|
|
use_mlock, progress_callback, progress_callback_user_data);
|
|
} catch (const std::exception & err) {
|
|
LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
static struct ggml_cgraph * llm_build_llama(
|
|
llama_context & lctx,
|
|
const llama_token * tokens,
|
|
const float * embd,
|
|
int n_tokens,
|
|
int n_past) {
|
|
|
|
GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
|
|
|
|
const int N = n_tokens;
|
|
|
|
const auto & model = lctx.model;
|
|
const auto & hparams = model.hparams;
|
|
|
|
const auto & kv_self = lctx.kv_self;
|
|
|
|
GGML_ASSERT(!!kv_self.ctx);
|
|
|
|
const int64_t n_embd = hparams.n_embd;
|
|
const int64_t n_layer = hparams.n_layer;
|
|
const int64_t n_ctx = hparams.n_ctx;
|
|
const int64_t n_head = hparams.n_head;
|
|
const int64_t n_head_kv = hparams.n_head_kv;
|
|
const int64_t n_embd_head = hparams.n_embd_head();
|
|
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
|
|
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
|
|
const float freq_base = hparams.rope_freq_base;
|
|
const float freq_scale = hparams.rope_freq_scale;
|
|
const float norm_rms_eps = hparams.f_norm_rms_eps;
|
|
|
|
const int n_gpu_layers = model.n_gpu_layers;
|
|
|
|
auto & buf_compute = lctx.buf_compute;
|
|
|
|
struct ggml_init_params params = {
|
|
/*.mem_size =*/ buf_compute.size,
|
|
/*.mem_buffer =*/ buf_compute.data,
|
|
/*.no_alloc =*/ false,
|
|
};
|
|
|
|
params.no_alloc = true;
|
|
|
|
struct ggml_context * ctx0 = ggml_init(params);
|
|
|
|
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
|
|
|
struct ggml_tensor * cur;
|
|
struct ggml_tensor * inpL;
|
|
|
|
if (tokens) {
|
|
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
|
|
|
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
|
memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
|
|
}
|
|
ggml_set_name(inp_tokens, "inp_tokens");
|
|
|
|
inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
|
|
} else {
|
|
#ifdef GGML_USE_MPI
|
|
GGML_ASSERT(false && "not implemented");
|
|
#endif
|
|
|
|
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
|
|
|
|
ggml_allocr_alloc(lctx.alloc, inpL);
|
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
|
memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
|
|
}
|
|
}
|
|
|
|
const int i_gpu_start = n_layer - n_gpu_layers;
|
|
(void) i_gpu_start;
|
|
|
|
// offload functions set the tensor output backend to GPU
|
|
// tensors are GPU-accelerated if any input or the output has been offloaded
|
|
//
|
|
// with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
|
|
// in that case ggml_cuda_assign_buffers has no effect
|
|
offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
|
|
offload_func_t offload_func_kq = llama_nop;
|
|
offload_func_t offload_func_v = llama_nop;
|
|
|
|
#ifdef GGML_USE_CUBLAS
|
|
if (n_gpu_layers > n_layer) {
|
|
offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
|
|
}
|
|
if (n_gpu_layers > n_layer + 1) {
|
|
offload_func_v = ggml_cuda_assign_buffers_no_alloc;
|
|
}
|
|
if (n_gpu_layers > n_layer + 2) {
|
|
offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
|
|
}
|
|
#endif // GGML_USE_CUBLAS
|
|
|
|
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
|
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
|
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
|
}
|
|
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
|
|
|
for (int il = 0; il < n_layer; ++il) {
|
|
ggml_format_name(inpL, "layer_inp_%d", il);
|
|
|
|
offload_func_t offload_func = llama_nop;
|
|
|
|
#ifdef GGML_USE_CUBLAS
|
|
if (il >= i_gpu_start) {
|
|
offload_func = ggml_cuda_assign_buffers_no_alloc;
|
|
}
|
|
#endif // GGML_USE_CUBLAS
|
|
|
|
struct ggml_tensor * inpSA = inpL;
|
|
|
|
// norm
|
|
{
|
|
cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
|
|
offload_func(cur);
|
|
ggml_set_name(cur, "rms_norm_0");
|
|
|
|
// cur = cur*attn_norm(broadcasted)
|
|
cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
|
|
offload_func(cur);
|
|
ggml_set_name(cur, "attention_norm_0");
|
|
}
|
|
|
|
// self-attention
|
|
{
|
|
// compute Q and K and RoPE them
|
|
struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
|
offload_func_kq(tmpk);
|
|
ggml_set_name(tmpk, "tmpk");
|
|
|
|
struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
|
offload_func_kq(tmpq);
|
|
ggml_set_name(tmpq, "tmpq");
|
|
|
|
struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
|
|
offload_func_kq(Kcur);
|
|
ggml_set_name(Kcur, "Kcur");
|
|
|
|
struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
|
|
offload_func_kq(Qcur);
|
|
ggml_set_name(Qcur, "Qcur");
|
|
|
|
// store key and value to memory
|
|
{
|
|
// compute the transposed [N, n_embd] V matrix
|
|
|
|
struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
|
offload_func_v(tmpv);
|
|
ggml_set_name(tmpv, "tmpv");
|
|
|
|
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
|
|
offload_func_v(Vcur);
|
|
ggml_set_name(Vcur, "Vcur");
|
|
|
|
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
|
|
offload_func_kq(k);
|
|
ggml_set_name(k, "k");
|
|
|
|
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
|
|
( n_ctx)*ggml_element_size(kv_self.v),
|
|
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
|
|
offload_func_v(v);
|
|
ggml_set_name(v, "v");
|
|
|
|
// important: storing RoPE-ed version of K in the KV cache!
|
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
|
}
|
|
|
|
struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
|
|
offload_func_kq(Q);
|
|
ggml_set_name(Q, "Q");
|
|
|
|
struct ggml_tensor * K =
|
|
ggml_view_3d(ctx0, kv_self.k,
|
|
n_embd_head, n_past + N, n_head_kv,
|
|
ggml_element_size(kv_self.k)*n_embd_gqa,
|
|
ggml_element_size(kv_self.k)*n_embd_head,
|
|
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
|
offload_func_kq(K);
|
|
ggml_set_name(K, "K");
|
|
|
|
// K * Q
|
|
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
|
offload_func_kq(KQ);
|
|
ggml_set_name(KQ, "KQ");
|
|
|
|
// KQ_scaled = KQ / sqrt(n_embd_head)
|
|
// KQ_scaled shape [n_past + N, N, n_head, 1]
|
|
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
|
|
offload_func_kq(KQ_scaled);
|
|
ggml_set_name(KQ_scaled, "KQ_scaled");
|
|
|
|
// KQ_masked = mask_past(KQ_scaled)
|
|
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
|
|
offload_func_kq(KQ_masked);
|
|
ggml_set_name(KQ_masked, "KQ_masked");
|
|
|
|
// KQ = soft_max(KQ_masked)
|
|
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
|
offload_func_v(KQ_soft_max);
|
|
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
|
|
|
// split cached V into n_head heads
|
|
struct ggml_tensor * V =
|
|
ggml_view_3d(ctx0, kv_self.v,
|
|
n_past + N, n_embd_head, n_head_kv,
|
|
ggml_element_size(kv_self.v)*n_ctx,
|
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
|
offload_func_v(V);
|
|
ggml_set_name(V, "V");
|
|
|
|
#if 1
|
|
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
|
offload_func_v(KQV);
|
|
ggml_set_name(KQV, "KQV");
|
|
#else
|
|
// make V contiguous in memory to speed up the matmul, however we waste time on the copy
|
|
// on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
|
|
// is there a better way?
|
|
struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
|
|
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
|
|
#endif
|
|
|
|
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
|
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
|
offload_func_v(KQV_merged);
|
|
ggml_set_name(KQV_merged, "KQV_merged");
|
|
|
|
// cur = KQV_merged.contiguous().view(n_embd, N)
|
|
cur = ggml_cpy(ctx0,
|
|
KQV_merged,
|
|
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
|
|
offload_func_v(cur);
|
|
ggml_set_name(cur, "KQV_merged_contiguous");
|
|
|
|
// projection (no bias)
|
|
cur = ggml_mul_mat(ctx0,
|
|
model.layers[il].wo,
|
|
cur);
|
|
offload_func(cur);
|
|
ggml_set_name(cur, "result_wo");
|
|
}
|
|
|
|
struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
|
|
offload_func(inpFF);
|
|
ggml_set_name(inpFF, "inpFF");
|
|
|
|
// feed-forward network
|
|
{
|
|
// norm
|
|
{
|
|
cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
|
|
offload_func(cur);
|
|
ggml_set_name(cur, "rms_norm_1");
|
|
|
|
// cur = cur*ffn_norm(broadcasted)
|
|
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
|
|
offload_func(cur);
|
|
ggml_set_name(cur, "ffn_norm");
|
|
}
|
|
|
|
struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
|
|
model.layers[il].w3,
|
|
cur);
|
|
offload_func(tmp);
|
|
ggml_set_name(tmp, "result_w3");
|
|
|
|
cur = ggml_mul_mat(ctx0,
|
|
model.layers[il].w1,
|
|
cur);
|
|
offload_func(cur);
|
|
ggml_set_name(cur, "result_w1");
|
|
|
|
// SILU activation
|
|
cur = ggml_silu(ctx0, cur);
|
|
offload_func(cur);
|
|
ggml_set_name(cur, "silu");
|
|
|
|
cur = ggml_mul(ctx0, cur, tmp);
|
|
offload_func(cur);
|
|
ggml_set_name(cur, "silu_x_result_w3");
|
|
|
|
cur = ggml_mul_mat(ctx0,
|
|
model.layers[il].w2,
|
|
cur);
|
|
offload_func(cur);
|
|
ggml_set_name(cur, "result_w2");
|
|
}
|
|
|
|
cur = ggml_add(ctx0, cur, inpFF);
|
|
offload_func(cur);
|
|
ggml_set_name(cur, "inpFF_+_result_w2");
|
|
|
|
// input for next layer
|
|
inpL = cur;
|
|
}
|
|
|
|
cur = inpL;
|
|
|
|
// norm
|
|
{
|
|
cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
|
|
offload_func_nr(cur);
|
|
ggml_set_name(cur, "rms_norm_2");
|
|
|
|
// cur = cur*norm(broadcasted)
|
|
cur = ggml_mul(ctx0, cur, model.output_norm);
|
|
// offload_func_nr(cur); // TODO CPU + GPU mirrored backend
|
|
ggml_set_name(cur, "result_norm");
|
|
}
|
|
|
|
// lm_head
|
|
cur = ggml_mul_mat(ctx0, model.output, cur);
|
|
ggml_set_name(cur, "result_output");
|
|
|
|
ggml_build_forward_expand(gf, cur);
|
|
|
|
ggml_free(ctx0);
|
|
|
|
return gf;
|
|
}
|
|
|
|
static struct ggml_cgraph * llm_build_falcon(
|
|
llama_context & lctx,
|
|
const llama_token * tokens,
|
|
const float * embd,
|
|
int n_tokens,
|
|
int n_past) {
|
|
|
|
GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
|
|
|
|
const int N = n_tokens;
|
|
|
|
const auto & model = lctx.model;
|
|
const auto & hparams = model.hparams;
|
|
|
|
const auto & kv_self = lctx.kv_self;
|
|
|
|
GGML_ASSERT(!!kv_self.ctx);
|
|
|
|
const int64_t n_embd = hparams.n_embd;
|
|
const int64_t n_layer = hparams.n_layer;
|
|
const int64_t n_ctx = hparams.n_ctx;
|
|
const int64_t n_head = hparams.n_head;
|
|
const int64_t n_head_kv = hparams.n_head_kv;
|
|
const int64_t n_embd_head = hparams.n_embd_head();
|
|
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
|
|
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
|
|
const float freq_base = hparams.rope_freq_base;
|
|
const float freq_scale = hparams.rope_freq_scale;
|
|
const float norm_eps = hparams.f_norm_eps;
|
|
|
|
const int n_gpu_layers = model.n_gpu_layers;
|
|
|
|
auto & buf_compute = lctx.buf_compute;
|
|
|
|
struct ggml_init_params params = {
|
|
/*.mem_size =*/ buf_compute.size,
|
|
/*.mem_buffer =*/ buf_compute.data,
|
|
/*.no_alloc =*/ false,
|
|
};
|
|
|
|
params.no_alloc = true;
|
|
|
|
struct ggml_context * ctx0 = ggml_init(params);
|
|
|
|
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
|
|
|
struct ggml_tensor * cur;
|
|
struct ggml_tensor * inpL;
|
|
|
|
if (tokens) {
|
|
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
|
|
|
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
|
memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
|
|
}
|
|
ggml_set_name(inp_tokens, "inp_tokens");
|
|
|
|
inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
|
|
} else {
|
|
#ifdef GGML_USE_MPI
|
|
GGML_ASSERT(false && "not implemented");
|
|
#endif
|
|
|
|
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
|
|
|
|
ggml_allocr_alloc(lctx.alloc, inpL);
|
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
|
memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
|
|
}
|
|
}
|
|
|
|
const int i_gpu_start = n_layer - n_gpu_layers;
|
|
(void) i_gpu_start;
|
|
|
|
// offload functions set the tensor output backend to GPU
|
|
// tensors are GPU-accelerated if any input or the output has been offloaded
|
|
//
|
|
// with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
|
|
// in that case ggml_cuda_assign_buffers has no effect
|
|
offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
|
|
offload_func_t offload_func_kq = llama_nop;
|
|
offload_func_t offload_func_v = llama_nop;
|
|
|
|
#ifdef GGML_USE_CUBLAS
|
|
if (n_gpu_layers > n_layer) {
|
|
offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
|
|
}
|
|
if (n_gpu_layers > n_layer + 1) {
|
|
offload_func_v = ggml_cuda_assign_buffers_no_alloc;
|
|
}
|
|
if (n_gpu_layers > n_layer + 2) {
|
|
offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
|
|
}
|
|
#endif // GGML_USE_CUBLAS
|
|
|
|
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
|
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
|
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
|
}
|
|
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
|
|
|
for (int il = 0; il < n_layer; ++il) {
|
|
struct ggml_tensor * attn_norm;
|
|
|
|
offload_func_t offload_func = llama_nop;
|
|
|
|
#ifdef GGML_USE_CUBLAS
|
|
if (il >= i_gpu_start) {
|
|
offload_func = ggml_cuda_assign_buffers_no_alloc;
|
|
}
|
|
#endif // GGML_USE_CUBLAS
|
|
|
|
// self-attention
|
|
// TODO: refactor into common function (shared with LLaMA)
|
|
{
|
|
attn_norm = ggml_norm(ctx0, inpL, norm_eps);
|
|
offload_func(attn_norm);
|
|
|
|
attn_norm = ggml_add(ctx0,
|
|
ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm),
|
|
model.layers[il].attn_norm_b);
|
|
offload_func(attn_norm->src[0]);
|
|
offload_func(attn_norm);
|
|
|
|
if (model.layers[il].attn_norm_2) { // Falcon-40B
|
|
cur = ggml_norm(ctx0, inpL, norm_eps);
|
|
offload_func(cur);
|
|
|
|
cur = ggml_add(ctx0,
|
|
ggml_mul(ctx0, cur, model.layers[il].attn_norm_2),
|
|
model.layers[il].attn_norm_2_b);
|
|
offload_func(cur->src[0]);
|
|
offload_func(cur);
|
|
} else { // Falcon 7B
|
|
cur = attn_norm;
|
|
}
|
|
|
|
// compute QKV
|
|
|
|
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
|
offload_func_kq(cur);
|
|
|
|
// Note that the strides for Kcur, Vcur are set up so that the
|
|
// resulting views are misaligned with the tensor's storage
|
|
// (by applying the K/V offset we shift the tensor's original
|
|
// view to stick out behind the viewed QKV tensor's allocated
|
|
// memory, so to say). This is ok because no actual accesses
|
|
// happen to that out-of-range memory, but it can require some
|
|
// trickery when trying to accurately dump these views for
|
|
// debugging.
|
|
|
|
const size_t wsize = ggml_type_size(cur->type);
|
|
|
|
// TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
|
|
// non-contiguous views is added for the rope operator
|
|
struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d(
|
|
ctx0, cur, n_embd_head, n_head, N,
|
|
wsize * n_embd_head,
|
|
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
|
0));
|
|
offload_func_kq(tmpq);
|
|
|
|
struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d(
|
|
ctx0, cur, n_embd_head, n_head_kv, N,
|
|
wsize * n_embd_head,
|
|
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
|
wsize * n_embd_head * n_head));
|
|
offload_func_kq(tmpk);
|
|
|
|
struct ggml_tensor * tmpv = ggml_view_3d(
|
|
ctx0, cur, n_embd_head, n_head_kv, N,
|
|
wsize * n_embd_head,
|
|
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
|
wsize * n_embd_head * (n_head + n_head_kv));
|
|
offload_func_v(tmpv);
|
|
|
|
// using mode = 2 for neox mode
|
|
struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, tmpq, n_past, n_embd_head, 2, 0, freq_base, freq_scale);
|
|
offload_func_kq(Qcur);
|
|
struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, tmpk, n_past, n_embd_head, 2, 0, freq_base, freq_scale);
|
|
offload_func_kq(Kcur);
|
|
|
|
{
|
|
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N));
|
|
offload_func_v(Vcur);
|
|
offload_func_v(Vcur->src[0]->src[0]);
|
|
ggml_set_name(Vcur, "Vcur");
|
|
|
|
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
|
|
offload_func_kq(k);
|
|
ggml_set_name(k, "k");
|
|
|
|
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
|
|
( n_ctx)*ggml_element_size(kv_self.v),
|
|
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
|
|
offload_func_v(v);
|
|
|
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
|
}
|
|
|
|
struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
|
|
offload_func_kq(Q);
|
|
ggml_set_name(Q, "Q");
|
|
|
|
struct ggml_tensor * K =
|
|
ggml_view_3d(ctx0, kv_self.k,
|
|
n_embd_head, n_past + N, n_head_kv,
|
|
ggml_element_size(kv_self.k)*n_embd_gqa,
|
|
ggml_element_size(kv_self.k)*n_embd_head,
|
|
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
|
offload_func_kq(K);
|
|
ggml_set_name(K, "K");
|
|
|
|
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
|
offload_func_kq(KQ);
|
|
ggml_set_name(KQ, "KQ");
|
|
|
|
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
|
|
offload_func_kq(KQ_scaled);
|
|
ggml_set_name(KQ_scaled, "KQ_scaled");
|
|
|
|
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
|
|
offload_func_kq(KQ_masked);
|
|
ggml_set_name(KQ_masked, "KQ_masked");
|
|
|
|
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
|
offload_func_v(KQ_soft_max);
|
|
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
|
|
|
struct ggml_tensor * V =
|
|
ggml_view_3d(ctx0, kv_self.v,
|
|
n_past + N, n_embd_head, n_head_kv,
|
|
ggml_element_size(kv_self.v)*n_ctx,
|
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
|
offload_func_v(V);
|
|
ggml_set_name(V, "V");
|
|
|
|
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
|
offload_func_v(KQV);
|
|
ggml_set_name(KQV, "KQV");
|
|
|
|
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
|
offload_func_v(KQV_merged);
|
|
ggml_set_name(KQV_merged, "KQV_merged");
|
|
|
|
cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
|
|
offload_func_v(cur);
|
|
ggml_set_name(cur, "KQV_merged_contiguous");
|
|
|
|
cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
|
|
offload_func(cur);
|
|
ggml_set_name(cur, "result_wo");
|
|
}
|
|
|
|
struct ggml_tensor * attn_out = cur;
|
|
|
|
// feed forward
|
|
{
|
|
struct ggml_tensor * inpFF = attn_norm;
|
|
|
|
cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF);
|
|
offload_func(cur);
|
|
|
|
cur = ggml_gelu(ctx0, cur);
|
|
offload_func(cur);
|
|
cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
|
|
offload_func(cur);
|
|
}
|
|
|
|
cur = ggml_add(ctx0, cur, attn_out);
|
|
offload_func(cur);
|
|
cur = ggml_add(ctx0, cur, inpL);
|
|
offload_func(cur);
|
|
|
|
// input for next layer
|
|
inpL = cur;
|
|
}
|
|
|
|
cur = inpL;
|
|
|
|
// norm
|
|
{
|
|
cur = ggml_norm(ctx0, cur, norm_eps);
|
|
offload_func_nr(cur);
|
|
|
|
cur = ggml_add(ctx0,
|
|
ggml_mul(ctx0, cur, model.output_norm),
|
|
model.output_norm_b);
|
|
ggml_set_name(cur, "result_norm");
|
|
}
|
|
|
|
cur = ggml_mul_mat(ctx0, model.output, cur);
|
|
ggml_set_name(cur, "result_output");
|
|
|
|
ggml_build_forward_expand(gf, cur);
|
|
|
|
ggml_free(ctx0);
|
|
|
|
return gf;
|
|
}
|
|
|
|
static struct ggml_cgraph * llama_build_graph(
|
|
llama_context & lctx,
|
|
const llama_token * tokens,
|
|
const float * embd,
|
|
int n_tokens,
|
|
int n_past) {
|
|
const auto & model = lctx.model;
|
|
|
|
struct ggml_cgraph * result = NULL;
|
|
|
|
switch (model.arch) {
|
|
case LLM_ARCH_LLAMA:
|
|
{
|
|
result = llm_build_llama(lctx, tokens, embd, n_tokens, n_past);
|
|
} break;
|
|
case LLM_ARCH_FALCON:
|
|
{
|
|
result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past);
|
|
} break;
|
|
default:
|
|
GGML_ASSERT(false);
|
|
};
|
|
|
|
return result;
|
|
}
|
|
|
|
// evaluate the transformer
|
|
//
|
|
// - lctx: llama context
|
|
// - tokens: new batch of tokens to process
|
|
// - embd embeddings input
|
|
// - n_tokens number of tokens
|
|
// - n_past: the context size so far
|
|
// - n_threads: number of threads to use
|
|
//
|
|
static bool llama_eval_internal(
|
|
llama_context & lctx,
|
|
const llama_token * tokens,
|
|
const float * embd,
|
|
int n_tokens,
|
|
int n_past,
|
|
int n_threads,
|
|
const char * cgraph_fname) {
|
|
|
|
GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
|
|
|
|
GGML_ASSERT(n_tokens > 0);
|
|
GGML_ASSERT(n_past >= 0);
|
|
// TODO: keep the values of n_batch and n_ctx
|
|
// GGML_ASSERT(n_tokens <= n_batch);
|
|
// GGML_ASSERT(n_past + n_tokens <= n_ctx);
|
|
|
|
const int64_t t_start_us = ggml_time_us();
|
|
|
|
#ifdef GGML_USE_MPI
|
|
ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
|
|
#endif
|
|
|
|
GGML_ASSERT(n_threads > 0);
|
|
|
|
const int N = n_tokens;
|
|
|
|
const auto & model = lctx.model;
|
|
const auto & hparams = model.hparams;
|
|
|
|
const auto & kv_self = lctx.kv_self;
|
|
|
|
GGML_ASSERT(!!kv_self.ctx);
|
|
|
|
const int64_t n_embd = hparams.n_embd;
|
|
const int64_t n_vocab = hparams.n_vocab;
|
|
|
|
ggml_allocr_reset(lctx.alloc);
|
|
|
|
ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
|
|
|
|
ggml_allocr_alloc_graph(lctx.alloc, gf);
|
|
|
|
#ifdef GGML_USE_CUBLAS
|
|
for (int i = 0; i < gf->n_leafs; i++) {
|
|
ggml_tensor * node = gf->leafs[i];
|
|
if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
|
|
ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
|
|
}
|
|
}
|
|
|
|
for (int i = 0; i < gf->n_nodes; i++) {
|
|
ggml_tensor * node = gf->nodes[i];
|
|
if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
|
|
ggml_cuda_assign_scratch_offset(node, (char*)node->data - (char *) lctx.buf_alloc.data);
|
|
}
|
|
}
|
|
#endif
|
|
|
|
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
|
|
|
|
// for big prompts, if BLAS is enabled, it is better to use only one thread
|
|
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
|
|
n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
|
|
|
|
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
|
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
|
|
|
|
GGML_ASSERT(strcmp(res->name, "result_output") == 0);
|
|
GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
|
|
|
|
#if GGML_USE_MPI
|
|
const int64_t n_layer = hparams.n_layer;
|
|
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
|
|
#endif
|
|
|
|
#ifdef GGML_USE_METAL
|
|
if (lctx.ctx_metal) {
|
|
ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
|
|
ggml_metal_graph_compute(lctx.ctx_metal, gf);
|
|
ggml_metal_get_tensor (lctx.ctx_metal, res);
|
|
if (!lctx.embedding.empty()) {
|
|
ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
|
|
}
|
|
} else {
|
|
ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
|
|
}
|
|
#else
|
|
ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
|
|
#endif
|
|
|
|
#if GGML_USE_MPI
|
|
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
|
|
#endif
|
|
|
|
// update kv token count
|
|
lctx.kv_self.n = n_past + N;
|
|
|
|
if (cgraph_fname) {
|
|
ggml_graph_export(gf, cgraph_fname);
|
|
}
|
|
|
|
#ifdef GGML_PERF
|
|
// print timing information per ggml operation (for debugging purposes)
|
|
// requires GGML_PERF to be defined
|
|
ggml_graph_print(gf);
|
|
#endif
|
|
|
|
// plot the computation graph in dot format (for debugging purposes)
|
|
//if (n_past%100 == 0) {
|
|
// ggml_graph_dump_dot(gf, NULL, "llama.dot");
|
|
//}
|
|
|
|
// extract logits
|
|
{
|
|
auto & logits_out = lctx.logits;
|
|
|
|
if (lctx.logits_all) {
|
|
logits_out.resize(n_vocab * N);
|
|
memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N);
|
|
} else {
|
|
// return result for just the last token
|
|
logits_out.resize(n_vocab);
|
|
memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
|
|
}
|
|
}
|
|
|
|
// extract embeddings
|
|
if (!lctx.embedding.empty()) {
|
|
auto & embedding_out = lctx.embedding;
|
|
|
|
embedding_out.resize(n_embd);
|
|
memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
|
|
}
|
|
|
|
// measure the performance only for the single-token evals
|
|
if (N == 1) {
|
|
lctx.t_eval_us += ggml_time_us() - t_start_us;
|
|
lctx.n_eval++;
|
|
}
|
|
else if (N > 1) {
|
|
lctx.t_p_eval_us += ggml_time_us() - t_start_us;
|
|
lctx.n_p_eval += N;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
//
|
|
// tokenizer
|
|
//
|
|
|
|
static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
|
|
return vocab.type;
|
|
}
|
|
|
|
static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
|
|
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
|
|
}
|
|
|
|
static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
|
|
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
|
|
}
|
|
|
|
static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
|
|
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
|
|
}
|
|
|
|
static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
|
|
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
|
|
}
|
|
|
|
static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
|
|
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNUSED;
|
|
}
|
|
|
|
static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
|
|
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
|
|
}
|
|
|
|
static bool llama_is_bos_token(const llama_vocab & vocab, llama_token id) {
|
|
GGML_ASSERT(llama_is_control_token(vocab, id));
|
|
return id == vocab.special_bos_id;
|
|
}
|
|
|
|
static bool llama_is_eos_token(const llama_vocab & vocab, llama_token id ) {
|
|
GGML_ASSERT(llama_is_control_token(vocab, id));
|
|
return id == vocab.special_eos_id;
|
|
}
|
|
|
|
static bool llama_is_pad_token(const llama_vocab & vocab, llama_token id ) {
|
|
GGML_ASSERT(id < 0 || llama_is_control_token(vocab, id));
|
|
return id == vocab.special_pad_id;
|
|
}
|
|
|
|
static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
|
|
GGML_ASSERT(llama_is_byte_token(vocab, id));
|
|
const auto& token_data = vocab.id_to_token.at(id);
|
|
auto buf = token_data.text.substr(3, 2);
|
|
return strtol(buf.c_str(), NULL, 16);
|
|
}
|
|
|
|
static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
|
|
char buf[7];
|
|
int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
|
|
GGML_ASSERT(0 <= result && result < 7);
|
|
return vocab.token_to_id.at(buf);
|
|
}
|
|
|
|
static void llama_escape_whitespace(std::string & text) {
|
|
replace_all(text, " ", "\xe2\x96\x81");
|
|
}
|
|
|
|
static void llama_unescape_whitespace(std::string & word) {
|
|
replace_all(word, "\xe2\x96\x81", " ");
|
|
}
|
|
|
|
struct llm_symbol {
|
|
using index = int;
|
|
index prev;
|
|
index next;
|
|
const char * text;
|
|
size_t n;
|
|
};
|
|
|
|
static_assert(std::is_trivially_copyable<llm_symbol>::value, "llm_symbol is not trivially copyable");
|
|
|
|
// SPM tokenizer
|
|
// original implementation:
|
|
// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
|
|
|
|
struct llm_bigram_spm {
|
|
struct comparator {
|
|
bool operator()(llm_bigram_spm & l, llm_bigram_spm & r) {
|
|
return (l.score < r.score) || (l.score == r.score && l.left > r.left);
|
|
}
|
|
};
|
|
using queue_storage = std::vector<llm_bigram_spm>;
|
|
using queue = std::priority_queue<llm_bigram_spm, queue_storage, comparator>;
|
|
llm_symbol::index left;
|
|
llm_symbol::index right;
|
|
float score;
|
|
size_t size;
|
|
};
|
|
|
|
struct llm_tokenizer_spm {
|
|
llm_tokenizer_spm(const llama_vocab & vocab): vocab(vocab) {}
|
|
|
|
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
|
// split string into utf8 chars
|
|
int index = 0;
|
|
size_t offs = 0;
|
|
while (offs < text.size()) {
|
|
llm_symbol sym;
|
|
size_t len = utf8_len(text[offs]);
|
|
GGML_ASSERT(offs + len <= text.size());
|
|
sym.text = text.c_str() + offs;
|
|
sym.n = len;
|
|
offs += len;
|
|
sym.prev = index - 1;
|
|
sym.next = offs == text.size() ? -1 : index + 1;
|
|
index++;
|
|
symbols.emplace_back(sym);
|
|
}
|
|
|
|
// seed the work queue with all possible 2-character tokens.
|
|
for (size_t i = 1; i < symbols.size(); ++i) {
|
|
try_add_bigram(i - 1, i);
|
|
}
|
|
|
|
// keep substituting the highest frequency pairs for as long as we can.
|
|
while (!work_queue.empty()) {
|
|
auto bigram = work_queue.top();
|
|
work_queue.pop();
|
|
|
|
auto & left_sym = symbols[bigram.left];
|
|
auto & right_sym = symbols[bigram.right];
|
|
|
|
// if one of the symbols already got merged, skip it.
|
|
if (left_sym.n == 0 || right_sym.n == 0 ||
|
|
left_sym.n + right_sym.n != bigram.size) {
|
|
continue;
|
|
}
|
|
|
|
// merge the right sym into the left one
|
|
left_sym.n += right_sym.n;
|
|
right_sym.n = 0;
|
|
|
|
//LLAMA_LOG_INFO("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
|
|
|
|
// remove the right sym from the chain
|
|
left_sym.next = right_sym.next;
|
|
if (right_sym.next >= 0) {
|
|
symbols[right_sym.next].prev = bigram.left;
|
|
}
|
|
|
|
// find more substitutions
|
|
try_add_bigram(left_sym.prev, bigram.left);
|
|
try_add_bigram(bigram.left, left_sym.next);
|
|
}
|
|
|
|
for (int i = 0; i != -1; i = symbols[i].next) {
|
|
auto & symbol = symbols[i];
|
|
resegment(symbol, output);
|
|
}
|
|
}
|
|
|
|
private:
|
|
void resegment(llm_symbol & symbol, std::vector<llama_vocab::id> & output) {
|
|
auto text = std::string(symbol.text, symbol.n);
|
|
auto token = vocab.token_to_id.find(text);
|
|
|
|
// Do we need to support is_unused?
|
|
if (token != vocab.token_to_id.end()) {
|
|
output.push_back((*token).second);
|
|
return;
|
|
}
|
|
|
|
const auto p = rev_merge.find(text);
|
|
|
|
if (p == rev_merge.end()) {
|
|
// output any symbols that did not form tokens as bytes.
|
|
for (int j = 0; j < (int)symbol.n; ++j) {
|
|
llama_vocab::id token_id = llama_byte_to_token(vocab, symbol.text[j]);
|
|
output.push_back(token_id);
|
|
}
|
|
return;
|
|
}
|
|
|
|
resegment(symbols[p->second.first], output);
|
|
resegment(symbols[p->second.second], output);
|
|
}
|
|
|
|
void try_add_bigram(int left, int right) {
|
|
if (left == -1 || right == -1) {
|
|
return;
|
|
}
|
|
|
|
const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n);
|
|
auto token = vocab.token_to_id.find(text);
|
|
|
|
if (token == vocab.token_to_id.end()) {
|
|
return;
|
|
}
|
|
|
|
if (static_cast<size_t>((*token).second) >= vocab.id_to_token.size()) {
|
|
return;
|
|
}
|
|
|
|
const auto & tok_data = vocab.id_to_token[(*token).second];
|
|
|
|
llm_bigram_spm bigram;
|
|
bigram.left = left;
|
|
bigram.right = right;
|
|
bigram.score = tok_data.score;
|
|
bigram.size = text.size();
|
|
|
|
work_queue.push(bigram);
|
|
|
|
// Do we need to support is_unused?
|
|
rev_merge[text] = std::make_pair(left, right);
|
|
}
|
|
|
|
const llama_vocab & vocab;
|
|
|
|
std::vector<llm_symbol> symbols;
|
|
llm_bigram_spm::queue work_queue;
|
|
|
|
std::map<std::string, std::pair<int, int>> rev_merge;
|
|
};
|
|
|
|
// BPE tokenizer
|
|
// adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License]
|
|
// tried to simplify unicode stuff, so most likely does not work 100% correctly!
|
|
|
|
// TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused
|
|
|
|
struct llm_bigram_bpe {
|
|
struct comparator {
|
|
bool operator()(llm_bigram_bpe & l, llm_bigram_bpe & r) {
|
|
return l.rank > r.rank || (l.rank == r.rank && l.left > r.left);
|
|
}
|
|
};
|
|
|
|
using queue_storage = std::vector<llm_bigram_bpe>;
|
|
using queue = std::priority_queue<llm_bigram_bpe, queue_storage, comparator>;
|
|
llm_symbol::index left;
|
|
llm_symbol::index right;
|
|
std::string text;
|
|
int rank;
|
|
size_t size;
|
|
};
|
|
|
|
struct llm_tokenizer_bpe {
|
|
llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {}
|
|
|
|
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
|
int final_prev_index = -1;
|
|
auto word_collection = bpe_gpt2_preprocess(text);
|
|
|
|
symbols_final.clear();
|
|
|
|
for (auto & word : word_collection) {
|
|
work_queue = llm_bigram_bpe::queue();
|
|
symbols.clear();
|
|
|
|
int index = 0;
|
|
size_t offset = 0;
|
|
|
|
while (offset < word.size()) {
|
|
llm_symbol sym;
|
|
size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
|
|
sym.text = word.c_str() + offset;
|
|
sym.n = 1;
|
|
sym.n = char_len;
|
|
offset += sym.n;
|
|
sym.prev = index - 1;
|
|
sym.next = offset == word.size() ? -1 : index + 1;
|
|
index++;
|
|
symbols.emplace_back(sym);
|
|
}
|
|
for (size_t i = 1; i < symbols.size(); ++i) {
|
|
add_new_bigram(i - 1, i);
|
|
}
|
|
|
|
// build token(s)
|
|
while (!work_queue.empty()) {
|
|
auto bigram = work_queue.top();
|
|
work_queue.pop();
|
|
|
|
auto & left_symbol = symbols[bigram.left];
|
|
auto & right_symbol = symbols[bigram.right];
|
|
|
|
if (left_symbol.n == 0 || right_symbol.n == 0) {
|
|
continue;
|
|
}
|
|
std::string left_token = std::string(left_symbol.text, left_symbol.n);
|
|
std::string right_token = std::string(right_symbol.text, right_symbol.n);
|
|
if (left_token + right_token != bigram.text) {
|
|
continue; // Skip this bigram if it's outdated
|
|
}
|
|
|
|
// merge the right sym into the left one
|
|
left_symbol.n += right_symbol.n;
|
|
right_symbol.n = 0;
|
|
|
|
// remove the right sym from the chain
|
|
left_symbol.next = right_symbol.next;
|
|
if (right_symbol.next >= 0) {
|
|
symbols[right_symbol.next].prev = bigram.left;
|
|
}
|
|
|
|
add_new_bigram(left_symbol.prev, bigram.left); // left side of current symbol
|
|
add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol
|
|
}
|
|
|
|
// add the fnished tokens to the final list keeping correct order for next and prev
|
|
for (auto & sym : symbols) {
|
|
if (sym.n > 0) {
|
|
sym.prev = final_prev_index;
|
|
sym.next = -1;
|
|
if (final_prev_index != -1) {
|
|
symbols_final[final_prev_index].next = symbols_final.size();
|
|
}
|
|
symbols_final.emplace_back(sym);
|
|
final_prev_index = symbols_final.size() - 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
symbols = symbols_final;
|
|
|
|
if (!symbols.empty()) {
|
|
for (int i = 0; i != -1; i = symbols[i].next) {
|
|
auto & symbol = symbols[i];
|
|
if (symbol.n == 0) {
|
|
continue;
|
|
}
|
|
|
|
const std::string str = std::string(symbol.text, symbol.n);
|
|
const auto token = vocab.token_to_id.find(str);
|
|
|
|
if (token == vocab.token_to_id.end()) {
|
|
for (auto j = str.begin(); j != str.end(); ++j) {
|
|
std::string byte_str(1, *j);
|
|
auto token_multibyte = vocab.token_to_id.find(byte_str);
|
|
if (token_multibyte == vocab.token_to_id.end()) {
|
|
fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
|
|
}
|
|
output.push_back((*token_multibyte).second);
|
|
}
|
|
} else {
|
|
output.push_back((*token).second);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
private:
|
|
void add_new_bigram(int left, int right) {
|
|
if (left == -1 || right == -1) {
|
|
return;
|
|
}
|
|
|
|
std::string left_token = std::string(symbols[left].text, symbols[left].n);
|
|
std::string right_token = std::string(symbols[right].text, symbols[right].n);
|
|
|
|
int rank_found = -1;
|
|
|
|
rank_found = vocab.find_bpe_rank(left_token, right_token);
|
|
|
|
if (rank_found < 0) {
|
|
return;
|
|
}
|
|
|
|
llm_bigram_bpe bigram;
|
|
|
|
bigram.left = left;
|
|
bigram.right = right;
|
|
bigram.text = left_token + right_token;
|
|
bigram.size = left_token.size() + right_token.size();
|
|
bigram.rank = rank_found;
|
|
|
|
work_queue.push(bigram);
|
|
}
|
|
|
|
// probably not 100% correct
|
|
// TODO: this is quite slow - how to make it more efficient?
|
|
static std::vector<std::string> bpe_gpt2_preprocess(std::string text) {
|
|
std::vector<std::string> words;
|
|
|
|
// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
|
|
const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
|
|
const std::regex re(pattern);
|
|
std::smatch m;
|
|
|
|
while (std::regex_search(text, m, re)) {
|
|
for (auto x : m) {
|
|
words.push_back(x);
|
|
}
|
|
text = m.suffix();
|
|
}
|
|
|
|
return words;
|
|
}
|
|
|
|
const llama_vocab & vocab;
|
|
|
|
std::vector<llm_symbol> symbols;
|
|
std::vector<llm_symbol> symbols_final;
|
|
|
|
llm_bigram_bpe::queue work_queue;
|
|
};
|
|
|
|
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) {
|
|
std::vector<llama_vocab::id> output;
|
|
|
|
// OG tokenizer behavior:
|
|
//
|
|
// tokenizer.encode('', add_bos=True) returns [1]
|
|
// tokenizer.encode('', add_bos=False) returns []
|
|
|
|
if (bos && vocab.special_bos_id != -1) {
|
|
output.push_back(vocab.special_bos_id);
|
|
}
|
|
|
|
if (raw_text.empty()) {
|
|
return output;
|
|
}
|
|
|
|
switch (vocab.type) {
|
|
case LLAMA_VOCAB_TYPE_SPM:
|
|
{
|
|
// without adding this leading whitespace, we do not get the same results as the original tokenizer
|
|
raw_text = " " + raw_text;
|
|
|
|
llm_tokenizer_spm tokenizer(vocab);
|
|
llama_escape_whitespace(raw_text);
|
|
tokenizer.tokenize(raw_text, output);
|
|
} break;
|
|
case LLAMA_VOCAB_TYPE_BPE:
|
|
{
|
|
llm_tokenizer_bpe tokenizer(vocab);
|
|
tokenizer.tokenize(raw_text, output);
|
|
} break;
|
|
};
|
|
|
|
return output;
|
|
}
|
|
|
|
//
|
|
// grammar - internal
|
|
//
|
|
|
|
struct llama_partial_utf8 {
|
|
uint32_t value; // bit value so far (unshifted)
|
|
int n_remain; // num bytes remaining; -1 indicates invalid sequence
|
|
};
|
|
|
|
struct llama_grammar {
|
|
const std::vector<std::vector<llama_grammar_element>> rules;
|
|
std::vector<std::vector<const llama_grammar_element *>> stacks;
|
|
|
|
// buffer for partially generated UTF-8 sequence from accepted tokens
|
|
llama_partial_utf8 partial_utf8;
|
|
};
|
|
|
|
struct llama_grammar_candidate {
|
|
size_t index;
|
|
const uint32_t * code_points;
|
|
llama_partial_utf8 partial_utf8;
|
|
};
|
|
|
|
// Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
|
|
// pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
|
|
std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
|
|
const char * src,
|
|
llama_partial_utf8 partial_start) {
|
|
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
|
|
const char * pos = src;
|
|
std::vector<uint32_t> code_points;
|
|
uint32_t value = partial_start.value;
|
|
int n_remain = partial_start.n_remain;
|
|
|
|
// continue previous decode, if applicable
|
|
while (*pos != 0 && n_remain > 0) {
|
|
uint8_t next_byte = static_cast<uint8_t>(*pos);
|
|
if ((next_byte >> 6) != 2) {
|
|
// invalid sequence, abort
|
|
code_points.push_back(0);
|
|
return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, -1 });
|
|
}
|
|
value = (value << 6) + (next_byte & 0x3F);
|
|
++pos;
|
|
--n_remain;
|
|
}
|
|
|
|
if (partial_start.n_remain > 0 && n_remain == 0) {
|
|
code_points.push_back(value);
|
|
}
|
|
|
|
// decode any subsequent utf-8 sequences, which may end in an incomplete one
|
|
while (*pos != 0) {
|
|
uint8_t first_byte = static_cast<uint8_t>(*pos);
|
|
uint8_t highbits = first_byte >> 4;
|
|
n_remain = lookup[highbits] - 1;
|
|
|
|
if (n_remain < 0) {
|
|
// invalid sequence, abort
|
|
code_points.clear();
|
|
code_points.push_back(0);
|
|
return std::make_pair(std::move(code_points), llama_partial_utf8{ 0, n_remain });
|
|
}
|
|
|
|
uint8_t mask = (1 << (7 - n_remain)) - 1;
|
|
value = first_byte & mask;
|
|
++pos;
|
|
while (*pos != 0 && n_remain > 0) {
|
|
value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
|
|
++pos;
|
|
--n_remain;
|
|
}
|
|
if (n_remain == 0) {
|
|
code_points.push_back(value);
|
|
}
|
|
}
|
|
code_points.push_back(0);
|
|
|
|
return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
|
|
}
|
|
|
|
// returns true iff pos points to the end of one of the definitions of a rule
|
|
static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
|
|
switch (pos->type) {
|
|
case LLAMA_GRETYPE_END: return true; // NOLINT
|
|
case LLAMA_GRETYPE_ALT: return true; // NOLINT
|
|
default: return false;
|
|
}
|
|
}
|
|
|
|
// returns true iff chr satisfies the char range at pos (regular or inverse range)
|
|
// asserts that pos is pointing to a char range element
|
|
static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
|
|
const llama_grammar_element * pos,
|
|
const uint32_t chr) {
|
|
|
|
bool found = false;
|
|
bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
|
|
|
|
GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
|
|
|
|
do {
|
|
if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
|
|
// inclusive range, e.g. [a-z]
|
|
found = found || (pos->value <= chr && chr <= pos[1].value);
|
|
pos += 2;
|
|
} else {
|
|
// exact char match, e.g. [a] or "a"
|
|
found = found || pos->value == chr;
|
|
pos += 1;
|
|
}
|
|
} while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
|
|
|
|
return std::make_pair(found == is_positive_char, pos);
|
|
}
|
|
|
|
// returns true iff some continuation of the given partial UTF-8 sequence could satisfy the char
|
|
// range at pos (regular or inverse range)
|
|
// asserts that pos is pointing to a char range element
|
|
static bool llama_grammar_match_partial_char(
|
|
const llama_grammar_element * pos,
|
|
const llama_partial_utf8 partial_utf8) {
|
|
|
|
bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
|
|
GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
|
|
|
|
uint32_t partial_value = partial_utf8.value;
|
|
int n_remain = partial_utf8.n_remain;
|
|
|
|
// invalid sequence or 7-bit char split across 2 bytes (overlong)
|
|
if (n_remain < 0 || (n_remain == 1 && partial_value < 2)) {
|
|
return false;
|
|
}
|
|
|
|
// range of possible code points this partial UTF-8 sequence could complete to
|
|
uint32_t low = partial_value << (n_remain * 6);
|
|
uint32_t high = low | ((1 << (n_remain * 6)) - 1);
|
|
|
|
if (low == 0) {
|
|
if (n_remain == 2) {
|
|
low = 1 << 11;
|
|
} else if (n_remain == 3) {
|
|
low = 1 << 16;
|
|
}
|
|
}
|
|
|
|
do {
|
|
if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
|
|
// inclusive range, e.g. [a-z]
|
|
if (pos->value <= high && low <= pos[1].value) {
|
|
return is_positive_char;
|
|
}
|
|
pos += 2;
|
|
} else {
|
|
// exact char match, e.g. [a] or "a"
|
|
if (low <= pos->value && pos->value <= high) {
|
|
return is_positive_char;
|
|
}
|
|
pos += 1;
|
|
}
|
|
} while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
|
|
|
|
return !is_positive_char;
|
|
}
|
|
|
|
|
|
// transforms a grammar pushdown stack into N possible stacks, all ending
|
|
// at a character range (terminal element)
|
|
static void llama_grammar_advance_stack(
|
|
const std::vector<std::vector<llama_grammar_element>> & rules,
|
|
const std::vector<const llama_grammar_element *> & stack,
|
|
std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
|
|
|
|
if (stack.empty()) {
|
|
new_stacks.push_back(stack);
|
|
return;
|
|
}
|
|
|
|
const llama_grammar_element * pos = stack.back();
|
|
|
|
switch (pos->type) {
|
|
case LLAMA_GRETYPE_RULE_REF: {
|
|
const size_t rule_id = static_cast<size_t>(pos->value);
|
|
const llama_grammar_element * subpos = rules[rule_id].data();
|
|
do {
|
|
// init new stack without the top (pos)
|
|
std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
|
|
if (!llama_grammar_is_end_of_sequence(pos + 1)) {
|
|
// if this rule ref is followed by another element, add that to stack
|
|
new_stack.push_back(pos + 1);
|
|
}
|
|
if (!llama_grammar_is_end_of_sequence(subpos)) {
|
|
// if alternate is nonempty, add to stack
|
|
new_stack.push_back(subpos);
|
|
}
|
|
llama_grammar_advance_stack(rules, new_stack, new_stacks);
|
|
while (!llama_grammar_is_end_of_sequence(subpos)) {
|
|
// scan to end of alternate def
|
|
subpos++;
|
|
}
|
|
if (subpos->type == LLAMA_GRETYPE_ALT) {
|
|
// there's another alternate def of this rule to process
|
|
subpos++;
|
|
} else {
|
|
break;
|
|
}
|
|
} while (true);
|
|
break;
|
|
}
|
|
case LLAMA_GRETYPE_CHAR:
|
|
case LLAMA_GRETYPE_CHAR_NOT:
|
|
new_stacks.push_back(stack);
|
|
break;
|
|
default:
|
|
// end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
|
|
// (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
|
|
// those
|
|
GGML_ASSERT(false);
|
|
}
|
|
}
|
|
|
|
// takes a set of possible pushdown stacks on a grammar, which are required to
|
|
// be positioned at a character range (see `llama_grammar_advance_stack`), and
|
|
// produces the N possible stacks if the given char is accepted at those
|
|
// positions
|
|
static std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
|
|
const std::vector<std::vector<llama_grammar_element>> & rules,
|
|
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
|
|
const uint32_t chr) {
|
|
|
|
std::vector<std::vector<const llama_grammar_element *>> new_stacks;
|
|
|
|
for (const auto & stack : stacks) {
|
|
if (stack.empty()) {
|
|
continue;
|
|
}
|
|
|
|
auto match = llama_grammar_match_char(stack.back(), chr);
|
|
if (match.first) {
|
|
const llama_grammar_element * pos = match.second;
|
|
|
|
// update top of stack to next element, if any
|
|
std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
|
|
if (!llama_grammar_is_end_of_sequence(pos)) {
|
|
new_stack.push_back(pos);
|
|
}
|
|
llama_grammar_advance_stack(rules, new_stack, new_stacks);
|
|
}
|
|
}
|
|
|
|
return new_stacks;
|
|
}
|
|
|
|
static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
|
|
const std::vector<std::vector<llama_grammar_element>> & rules,
|
|
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
|
|
const std::vector<llama_grammar_candidate> & candidates);
|
|
|
|
static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
|
|
const std::vector<std::vector<llama_grammar_element>> & rules,
|
|
const std::vector<const llama_grammar_element *> & stack,
|
|
const std::vector<llama_grammar_candidate> & candidates) {
|
|
|
|
std::vector<llama_grammar_candidate> rejects;
|
|
|
|
if (stack.empty()) {
|
|
for (auto tok : candidates) {
|
|
if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
|
|
rejects.push_back(tok);
|
|
}
|
|
}
|
|
return rejects;
|
|
}
|
|
|
|
const llama_grammar_element * stack_pos = stack.back();
|
|
|
|
std::vector<llama_grammar_candidate> next_candidates;
|
|
for (auto tok : candidates) {
|
|
if (*tok.code_points == 0) {
|
|
// reached end of full codepoints in token, reject iff it ended in a partial sequence
|
|
// that cannot satisfy this position in grammar
|
|
if (tok.partial_utf8.n_remain != 0 &&
|
|
!llama_grammar_match_partial_char(stack_pos, tok.partial_utf8)) {
|
|
rejects.push_back(tok);
|
|
}
|
|
} else if (llama_grammar_match_char(stack_pos, *tok.code_points).first) {
|
|
next_candidates.push_back({ tok.index, tok.code_points + 1, tok.partial_utf8 });
|
|
} else {
|
|
rejects.push_back(tok);
|
|
}
|
|
}
|
|
|
|
const auto * stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;
|
|
|
|
// update top of stack to next element, if any
|
|
std::vector<const llama_grammar_element *> stack_after(stack.begin(), stack.end() - 1);
|
|
if (!llama_grammar_is_end_of_sequence(stack_pos_after)) {
|
|
stack_after.push_back(stack_pos_after);
|
|
}
|
|
std::vector<std::vector<const llama_grammar_element *>> next_stacks;
|
|
llama_grammar_advance_stack(rules, stack_after, next_stacks);
|
|
|
|
auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
|
|
for (auto tok : next_rejects) {
|
|
rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
|
|
}
|
|
|
|
return rejects;
|
|
}
|
|
|
|
static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
|
|
const std::vector<std::vector<llama_grammar_element>> & rules,
|
|
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
|
|
const std::vector<llama_grammar_candidate> & candidates) {
|
|
GGML_ASSERT(!stacks.empty()); // REVIEW
|
|
|
|
if (candidates.empty()) {
|
|
return std::vector<llama_grammar_candidate>();
|
|
}
|
|
|
|
auto rejects = llama_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates);
|
|
|
|
for (size_t i = 1, size = stacks.size(); i < size; ++i) {
|
|
rejects = llama_grammar_reject_candidates_for_stack(rules, stacks[i], rejects);
|
|
}
|
|
return rejects;
|
|
}
|
|
|
|
//
|
|
// grammar - external
|
|
//
|
|
|
|
struct llama_grammar * llama_grammar_init(
|
|
const llama_grammar_element ** rules,
|
|
size_t n_rules,
|
|
size_t start_rule_index) {
|
|
const llama_grammar_element * pos;
|
|
|
|
// copy rule definitions into vectors
|
|
std::vector<std::vector<llama_grammar_element>> vec_rules(n_rules);
|
|
for (size_t i = 0; i < n_rules; i++) {
|
|
for (pos = rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
|
|
vec_rules[i].push_back(*pos);
|
|
}
|
|
vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
|
|
}
|
|
|
|
// loop over alternates of start rule to build initial stacks
|
|
std::vector<std::vector<const llama_grammar_element *>> stacks;
|
|
pos = rules[start_rule_index];
|
|
do {
|
|
std::vector<const llama_grammar_element *> stack;
|
|
if (!llama_grammar_is_end_of_sequence(pos)) {
|
|
// if alternate is nonempty, add to stack
|
|
stack.push_back(pos);
|
|
}
|
|
llama_grammar_advance_stack(vec_rules, stack, stacks);
|
|
while (!llama_grammar_is_end_of_sequence(pos)) {
|
|
// scan to end of alternate def
|
|
pos++;
|
|
}
|
|
if (pos->type == LLAMA_GRETYPE_ALT) {
|
|
// there's another alternate def of this rule to process
|
|
pos++;
|
|
} else {
|
|
break;
|
|
}
|
|
} while (true);
|
|
|
|
return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
|
|
}
|
|
|
|
void llama_grammar_free(struct llama_grammar * grammar) {
|
|
delete grammar;
|
|
}
|
|
|
|
//
|
|
// sampling
|
|
//
|
|
|
|
void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
|
|
GGML_ASSERT(candidates->size > 0);
|
|
|
|
const int64_t t_start_sample_us = ggml_time_us();
|
|
|
|
// Sort the logits in descending order
|
|
if (!candidates->sorted) {
|
|
std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
|
|
return a.logit > b.logit;
|
|
});
|
|
candidates->sorted = true;
|
|
}
|
|
|
|
float max_l = candidates->data[0].logit;
|
|
float cum_sum = 0.0f;
|
|
for (size_t i = 0; i < candidates->size; ++i) {
|
|
float p = expf(candidates->data[i].logit - max_l);
|
|
candidates->data[i].p = p;
|
|
cum_sum += p;
|
|
}
|
|
for (size_t i = 0; i < candidates->size; ++i) {
|
|
candidates->data[i].p /= cum_sum;
|
|
}
|
|
|
|
if (ctx) {
|
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
}
|
|
}
|
|
|
|
void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep) {
|
|
const int64_t t_start_sample_us = ggml_time_us();
|
|
|
|
k = std::max(k, (int) min_keep);
|
|
k = std::min(k, (int) candidates->size);
|
|
|
|
// Sort scores in descending order
|
|
if (!candidates->sorted) {
|
|
auto comp = [](const llama_token_data & a, const llama_token_data & b) {
|
|
return a.logit > b.logit;
|
|
};
|
|
if (k == (int) candidates->size) {
|
|
std::sort(candidates->data, candidates->data + candidates->size, comp);
|
|
} else {
|
|
std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
|
|
}
|
|
candidates->sorted = true;
|
|
}
|
|
candidates->size = k;
|
|
|
|
if (ctx) {
|
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
}
|
|
}
|
|
|
|
void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
|
|
if (p >= 1.0f) {
|
|
return;
|
|
}
|
|
|
|
llama_sample_softmax(ctx, candidates);
|
|
|
|
const int64_t t_start_sample_us = ggml_time_us();
|
|
|
|
// Compute the cumulative probabilities
|
|
float cum_sum = 0.0f;
|
|
size_t last_idx = candidates->size;
|
|
|
|
for (size_t i = 0; i < candidates->size; ++i) {
|
|
cum_sum += candidates->data[i].p;
|
|
|
|
// Check if the running sum is at least p or if we have kept at least min_keep tokens
|
|
// we set the last index to i+1 to indicate that the current iterate should be included in the set
|
|
if (cum_sum >= p && i + 1 >= min_keep) {
|
|
last_idx = i + 1;
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Resize the output vector to keep only the top-p tokens
|
|
candidates->size = last_idx;
|
|
|
|
if (ctx) {
|
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
}
|
|
}
|
|
|
|
void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
|
|
if (z >= 1.0f || candidates->size <= 2) {
|
|
return;
|
|
}
|
|
|
|
llama_sample_softmax(nullptr, candidates);
|
|
const int64_t t_start_sample_us = ggml_time_us();
|
|
|
|
// Compute the first and second derivatives
|
|
std::vector<float> first_derivatives(candidates->size - 1);
|
|
std::vector<float> second_derivatives(candidates->size - 2);
|
|
|
|
for (size_t i = 0; i < first_derivatives.size(); ++i) {
|
|
first_derivatives[i] = candidates->data[i].p - candidates->data[i + 1].p;
|
|
}
|
|
for (size_t i = 0; i < second_derivatives.size(); ++i) {
|
|
second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
|
|
}
|
|
|
|
// Calculate absolute value of second derivatives
|
|
for (size_t i = 0; i < second_derivatives.size(); ++i) {
|
|
second_derivatives[i] = std::abs(second_derivatives[i]);
|
|
}
|
|
|
|
// Normalize the second derivatives
|
|
{
|
|
const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
|
|
|
|
if (second_derivatives_sum > 1e-6f) {
|
|
for (float & value : second_derivatives) {
|
|
value /= second_derivatives_sum;
|
|
}
|
|
} else {
|
|
for (float & value : second_derivatives) {
|
|
value = 1.0f / second_derivatives.size();
|
|
}
|
|
}
|
|
}
|
|
|
|
float cum_sum = 0.0f;
|
|
size_t last_idx = candidates->size;
|
|
for (size_t i = 0; i < second_derivatives.size(); ++i) {
|
|
cum_sum += second_derivatives[i];
|
|
|
|
// Check if the running sum is greater than z or if we have kept at least min_keep tokens
|
|
if (cum_sum > z && i >= min_keep) {
|
|
last_idx = i;
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Resize the output vector to keep only the tokens above the tail location
|
|
candidates->size = last_idx;
|
|
|
|
if (ctx) {
|
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
}
|
|
}
|
|
|
|
void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
|
|
// Reference implementation:
|
|
// https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
|
|
if (p >= 1.0f) {
|
|
return;
|
|
}
|
|
|
|
// Compute the softmax of logits and calculate entropy
|
|
llama_sample_softmax(nullptr, candidates);
|
|
|
|
const int64_t t_start_sample_us = ggml_time_us();
|
|
|
|
float entropy = 0.0f;
|
|
for (size_t i = 0; i < candidates->size; ++i) {
|
|
entropy += -candidates->data[i].p * logf(candidates->data[i].p);
|
|
}
|
|
|
|
// Compute the absolute difference between negative log probability and entropy for each candidate
|
|
std::vector<float> shifted_scores;
|
|
for (size_t i = 0; i < candidates->size; ++i) {
|
|
float shifted_score = fabsf(-logf(candidates->data[i].p) - entropy);
|
|
shifted_scores.push_back(shifted_score);
|
|
}
|
|
|
|
// Sort tokens based on the shifted_scores and their corresponding indices
|
|
std::vector<size_t> indices(candidates->size);
|
|
std::iota(indices.begin(), indices.end(), 0);
|
|
|
|
std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) {
|
|
return shifted_scores[a] < shifted_scores[b];
|
|
});
|
|
|
|
// Compute the cumulative probabilities
|
|
float cum_sum = 0.0f;
|
|
size_t last_idx = indices.size();
|
|
|
|
for (size_t i = 0; i < indices.size(); ++i) {
|
|
size_t idx = indices[i];
|
|
cum_sum += candidates->data[idx].p;
|
|
|
|
// Check if the running sum is greater than typical or if we have kept at least min_keep tokens
|
|
if (cum_sum > p && i >= min_keep - 1) {
|
|
last_idx = i + 1;
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Resize the output vector to keep only the locally typical tokens
|
|
std::vector<llama_token_data> new_candidates;
|
|
for (size_t i = 0; i < last_idx; ++i) {
|
|
size_t idx = indices[i];
|
|
new_candidates.push_back(candidates->data[idx]);
|
|
}
|
|
|
|
// Replace the data in candidates with the new_candidates data
|
|
std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
|
|
candidates->size = new_candidates.size();
|
|
|
|
if (ctx) {
|
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
}
|
|
}
|
|
|
|
void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
|
|
const int64_t t_start_sample_us = ggml_time_us();
|
|
|
|
for (size_t i = 0; i < candidates_p->size; ++i) {
|
|
candidates_p->data[i].logit /= temp;
|
|
}
|
|
|
|
if (ctx) {
|
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
}
|
|
}
|
|
|
|
void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
|
|
if (last_tokens_size == 0 || penalty == 1.0f) {
|
|
return;
|
|
}
|
|
|
|
const int64_t t_start_sample_us = ggml_time_us();
|
|
|
|
for (size_t i = 0; i < candidates->size; ++i) {
|
|
const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
|
|
if (token_iter == last_tokens + last_tokens_size) {
|
|
continue;
|
|
}
|
|
|
|
// The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
|
|
// This is common fix for this problem, which is to multiply by the penalty instead of dividing.
|
|
if (candidates->data[i].logit <= 0) {
|
|
candidates->data[i].logit *= penalty;
|
|
} else {
|
|
candidates->data[i].logit /= penalty;
|
|
}
|
|
}
|
|
|
|
candidates->sorted = false;
|
|
|
|
if (ctx) {
|
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
}
|
|
}
|
|
|
|
void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
|
|
if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
|
|
return;
|
|
}
|
|
|
|
const int64_t t_start_sample_us = ggml_time_us();
|
|
|
|
// Create a frequency map to count occurrences of each token in last_tokens
|
|
std::unordered_map<llama_token, int> token_count;
|
|
for (size_t i = 0; i < last_tokens_size; ++i) {
|
|
token_count[last_tokens_p[i]]++;
|
|
}
|
|
|
|
// Apply frequency and presence penalties to the candidates
|
|
for (size_t i = 0; i < candidates->size; ++i) {
|
|
auto token_iter = token_count.find(candidates->data[i].id);
|
|
if (token_iter == token_count.end()) {
|
|
continue;
|
|
}
|
|
|
|
int count = token_iter->second;
|
|
candidates->data[i].logit -= float(count) * alpha_frequency + float(count > 0) * alpha_presence;
|
|
}
|
|
|
|
candidates->sorted = false;
|
|
|
|
if (ctx) {
|
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
}
|
|
}
|
|
|
|
void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
|
|
GGML_ASSERT(ctx);
|
|
const int64_t t_start_sample_us = ggml_time_us();
|
|
|
|
bool allow_eos = false;
|
|
for (const auto & stack : grammar->stacks) {
|
|
if (stack.empty()) {
|
|
allow_eos = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
const llama_token eos = llama_token_eos(ctx);
|
|
|
|
std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
|
|
std::vector<llama_grammar_candidate> candidates_grammar;
|
|
|
|
for (size_t i = 0; i < candidates->size; ++i) {
|
|
const llama_token id = candidates->data[i].id;
|
|
const std::string piece = llama_token_to_str(ctx, id);
|
|
if (id == eos) {
|
|
if (!allow_eos) {
|
|
candidates->data[i].logit = -INFINITY;
|
|
}
|
|
} else if (piece.empty() || piece[0] == 0) {
|
|
candidates->data[i].logit = -INFINITY;
|
|
} else {
|
|
candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8));
|
|
candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
|
|
}
|
|
}
|
|
|
|
const auto rejects = llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
|
|
for (const auto & reject : rejects) {
|
|
candidates->data[reject.index].logit = -INFINITY;
|
|
}
|
|
|
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
}
|
|
|
|
static void llama_log_softmax(float * array, size_t size) {
|
|
float max_l = *std::max_element(array, array + size);
|
|
float sum = 0.f;
|
|
for (size_t i = 0; i < size; ++i) {
|
|
float p = expf(array[i] - max_l);
|
|
sum += p;
|
|
array[i] = p;
|
|
}
|
|
|
|
for (size_t i = 0; i < size; ++i) {
|
|
array[i] = logf(array[i] / sum);
|
|
}
|
|
}
|
|
|
|
void llama_sample_classifier_free_guidance(
|
|
struct llama_context * ctx,
|
|
llama_token_data_array * candidates,
|
|
struct llama_context * guidance_ctx,
|
|
float scale) {
|
|
int64_t t_start_sample_us = ggml_time_us();
|
|
|
|
GGML_ASSERT(ctx);
|
|
|
|
auto n_vocab = llama_n_vocab(ctx);
|
|
|
|
GGML_ASSERT(n_vocab == (int)candidates->size);
|
|
GGML_ASSERT(!candidates->sorted);
|
|
|
|
std::vector<float> logits_base;
|
|
logits_base.reserve(candidates->size);
|
|
for (size_t i = 0; i < candidates->size; ++i) {
|
|
logits_base.push_back(candidates->data[i].logit);
|
|
}
|
|
llama_log_softmax(logits_base.data(), candidates->size);
|
|
|
|
float* logits_guidance = llama_get_logits(guidance_ctx);
|
|
llama_log_softmax(logits_guidance, n_vocab);
|
|
|
|
for (int i = 0; i < n_vocab; ++i) {
|
|
float logit_guidance = logits_guidance[i];
|
|
float logit_base = logits_base[i];
|
|
candidates->data[i].logit = scale * (logit_base - logit_guidance) + logit_guidance;
|
|
}
|
|
|
|
if (ctx) {
|
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
}
|
|
}
|
|
|
|
llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
|
|
GGML_ASSERT(ctx);
|
|
|
|
auto N = float(llama_n_vocab(ctx));
|
|
int64_t t_start_sample_us;
|
|
t_start_sample_us = ggml_time_us();
|
|
|
|
llama_sample_softmax(nullptr, candidates);
|
|
|
|
// Estimate s_hat using the most probable m tokens
|
|
float s_hat = 0.0;
|
|
float sum_ti_bi = 0.0;
|
|
float sum_ti_sq = 0.0;
|
|
for (size_t i = 0; i < size_t(m - 1) && i < candidates->size - 1; ++i) {
|
|
float t_i = logf(float(i + 2) / float(i + 1));
|
|
float b_i = logf(candidates->data[i].p / candidates->data[i + 1].p);
|
|
sum_ti_bi += t_i * b_i;
|
|
sum_ti_sq += t_i * t_i;
|
|
}
|
|
s_hat = sum_ti_bi / sum_ti_sq;
|
|
|
|
// Compute k from the estimated s_hat and target surprise value
|
|
float epsilon_hat = s_hat - 1;
|
|
float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);
|
|
|
|
// Sample the next word X using top-k sampling
|
|
llama_sample_top_k(nullptr, candidates, int(k), 1);
|
|
if (ctx) {
|
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
}
|
|
llama_token X = llama_sample_token(ctx, candidates);
|
|
t_start_sample_us = ggml_time_us();
|
|
|
|
// Compute error as the difference between observed surprise and target surprise value
|
|
size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
|
|
return candidate.id == X;
|
|
}));
|
|
float observed_surprise = -log2f(candidates->data[X_idx].p);
|
|
float e = observed_surprise - tau;
|
|
|
|
// Update mu using the learning rate and error
|
|
*mu = *mu - eta * e;
|
|
|
|
if (ctx) {
|
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
}
|
|
return X;
|
|
}
|
|
|
|
llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
|
|
int64_t t_start_sample_us;
|
|
t_start_sample_us = ggml_time_us();
|
|
|
|
llama_sample_softmax(ctx, candidates);
|
|
|
|
// Truncate the words with surprise values greater than mu
|
|
candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
|
|
return -log2f(candidate.p) > *mu;
|
|
}));
|
|
|
|
if (candidates->size == 0) {
|
|
candidates->size = 1;
|
|
}
|
|
|
|
if (ctx) {
|
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
}
|
|
|
|
// Normalize the probabilities of the remaining words
|
|
llama_sample_softmax(ctx, candidates);
|
|
|
|
// Sample the next word X from the remaining words
|
|
llama_token X = llama_sample_token(ctx, candidates);
|
|
t_start_sample_us = ggml_time_us();
|
|
|
|
// Compute error as the difference between observed surprise and target surprise value
|
|
size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
|
|
return candidate.id == X;
|
|
}));
|
|
float observed_surprise = -log2f(candidates->data[X_idx].p);
|
|
float e = observed_surprise - tau;
|
|
|
|
// Update mu using the learning rate and error
|
|
*mu = *mu - eta * e;
|
|
|
|
if (ctx) {
|
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
}
|
|
return X;
|
|
}
|
|
|
|
llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) {
|
|
const int64_t t_start_sample_us = ggml_time_us();
|
|
|
|
// Find max element
|
|
auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
|
|
return a.logit < b.logit;
|
|
});
|
|
|
|
llama_token result = max_iter->id;
|
|
if (ctx) {
|
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
ctx->n_sample++;
|
|
}
|
|
return result;
|
|
}
|
|
|
|
llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
|
|
GGML_ASSERT(ctx);
|
|
|
|
const int64_t t_start_sample_us = ggml_time_us();
|
|
llama_sample_softmax(nullptr, candidates);
|
|
|
|
std::vector<float> probs;
|
|
probs.reserve(candidates->size);
|
|
for (size_t i = 0; i < candidates->size; ++i) {
|
|
probs.push_back(candidates->data[i].p);
|
|
}
|
|
|
|
std::discrete_distribution<> dist(probs.begin(), probs.end());
|
|
auto & rng = ctx->rng;
|
|
int idx = dist(rng);
|
|
|
|
llama_token result = candidates->data[idx].id;
|
|
|
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
ctx->n_sample++;
|
|
return result;
|
|
}
|
|
|
|
void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
|
|
const int64_t t_start_sample_us = ggml_time_us();
|
|
|
|
if (token == llama_token_eos(ctx)) {
|
|
for (const auto & stack : grammar->stacks) {
|
|
if (stack.empty()) {
|
|
return;
|
|
}
|
|
}
|
|
GGML_ASSERT(false);
|
|
}
|
|
|
|
const std::string piece = llama_token_to_str(ctx, token);
|
|
|
|
// Note terminating 0 in decoded string
|
|
const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8);
|
|
const auto & code_points = decoded.first;
|
|
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
|
grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
|
|
}
|
|
grammar->partial_utf8 = decoded.second;
|
|
GGML_ASSERT(!grammar->stacks.empty());
|
|
|
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
}
|
|
|
|
//
|
|
// Beam search
|
|
//
|
|
|
|
struct llama_beam {
|
|
std::vector<llama_token> tokens;
|
|
float p; // Cumulative beam probability (renormalized relative to all beams)
|
|
bool eob; // Initialize end-of-beam to false. Callback sets this to true.
|
|
// Sort beams by probability. In case of ties, prefer beams at eob.
|
|
bool operator<(const llama_beam & rhs) const {
|
|
return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
|
|
}
|
|
// Shift off first n tokens and discard them.
|
|
void shift_tokens(const size_t n) {
|
|
if (n) {
|
|
std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
|
|
tokens.resize(tokens.size() - n);
|
|
}
|
|
}
|
|
llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
|
|
};
|
|
|
|
// A struct for calculating logit-related info.
|
|
struct llama_logit_info {
|
|
const float * const logits;
|
|
const int n_vocab;
|
|
const float max_l;
|
|
const float normalizer;
|
|
struct sum_exp {
|
|
float max_l;
|
|
float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
|
|
};
|
|
llama_logit_info(llama_context * ctx)
|
|
: logits(llama_get_logits(ctx))
|
|
, n_vocab(llama_n_vocab(ctx))
|
|
, max_l(*std::max_element(logits, logits + n_vocab))
|
|
, normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
|
|
{ }
|
|
llama_token_data get_token_data(const llama_token token_id) const {
|
|
constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
|
|
return {token_id, logits[token_id], p};
|
|
}
|
|
// Return top k token_data by logit.
|
|
std::vector<llama_token_data> top_k(size_t k) {
|
|
std::vector<llama_token_data> min_heap; // min-heap by logit
|
|
const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
|
|
min_heap.reserve(k_min);
|
|
for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
|
|
min_heap.push_back(get_token_data(token_id));
|
|
}
|
|
auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
|
|
std::make_heap(min_heap.begin(), min_heap.end(), comp);
|
|
for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
|
|
if (min_heap.front().logit < logits[token_id]) {
|
|
std::pop_heap(min_heap.begin(), min_heap.end(), comp);
|
|
min_heap.back().id = token_id;
|
|
min_heap.back().logit = logits[token_id];
|
|
std::push_heap(min_heap.begin(), min_heap.end(), comp);
|
|
}
|
|
}
|
|
return min_heap;
|
|
}
|
|
float probability_from_logit(float logit) {
|
|
return normalizer * std::exp(logit - max_l);
|
|
}
|
|
};
|
|
|
|
struct llama_beam_search_data {
|
|
llama_context * ctx;
|
|
size_t n_beams;
|
|
int n_past;
|
|
int n_predict;
|
|
int n_threads;
|
|
std::vector<llama_beam> beams;
|
|
std::vector<llama_beam> next_beams;
|
|
|
|
// Re-calculated on each loop iteration
|
|
size_t common_prefix_length;
|
|
|
|
// Used to communicate to/from callback on beams state.
|
|
std::vector<llama_beam_view> beam_views;
|
|
|
|
llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict, int n_threads)
|
|
: ctx(ctx)
|
|
, n_beams(n_beams)
|
|
, n_past(n_past)
|
|
, n_predict(n_predict)
|
|
, n_threads(n_threads)
|
|
, beam_views(n_beams) {
|
|
beams.reserve(n_beams);
|
|
next_beams.reserve(n_beams);
|
|
}
|
|
|
|
// Collapse beams to a single beam given by index.
|
|
void collapse_beams(const size_t beam_idx) {
|
|
if (0u < beam_idx) {
|
|
std::swap(beams[0], beams[beam_idx]);
|
|
}
|
|
beams.resize(1);
|
|
}
|
|
|
|
// Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
|
|
// The repetative patterns below reflect the 2 stages of heaps:
|
|
// * Gather elements until the vector is full, then call std::make_heap() on it.
|
|
// * If the heap is full and a new element is found that should be included, pop the
|
|
// least element to the back(), replace it with the new, then push it into the heap.
|
|
void fill_next_beams_by_top_probabilities(llama_beam & beam) {
|
|
// Min-heaps use a greater-than comparator.
|
|
const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
|
|
if (beam.eob) {
|
|
// beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
|
|
if (next_beams.size() < n_beams) {
|
|
next_beams.push_back(std::move(beam));
|
|
if (next_beams.size() == n_beams) {
|
|
std::make_heap(next_beams.begin(), next_beams.end(), comp);
|
|
}
|
|
} else if (next_beams.front().p < beam.p) {
|
|
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
|
|
next_beams.back() = std::move(beam);
|
|
std::push_heap(next_beams.begin(), next_beams.end(), comp);
|
|
}
|
|
} else {
|
|
// beam is not at end-of-sentence, so branch with next top_k tokens.
|
|
if (!beam.tokens.empty()) {
|
|
llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads);
|
|
}
|
|
llama_logit_info logit_info(ctx);
|
|
std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
|
|
size_t i=0;
|
|
if (next_beams.size() < n_beams) {
|
|
for (; next_beams.size() < n_beams ; ++i) {
|
|
llama_beam next_beam = beam;
|
|
next_beam.tokens.push_back(next_tokens[i].id);
|
|
next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
|
|
next_beams.push_back(std::move(next_beam));
|
|
}
|
|
std::make_heap(next_beams.begin(), next_beams.end(), comp);
|
|
} else {
|
|
for (; next_beams.front().p == 0.0f ; ++i) {
|
|
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
|
|
next_beams.back() = beam;
|
|
next_beams.back().tokens.push_back(next_tokens[i].id);
|
|
next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
|
|
std::push_heap(next_beams.begin(), next_beams.end(), comp);
|
|
}
|
|
}
|
|
for (; i < n_beams ; ++i) {
|
|
const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
|
|
if (next_beams.front().p < next_p) {
|
|
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
|
|
next_beams.back() = beam;
|
|
next_beams.back().tokens.push_back(next_tokens[i].id);
|
|
next_beams.back().p = next_p;
|
|
std::push_heap(next_beams.begin(), next_beams.end(), comp);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Find common_prefix_length based on beams.
|
|
// Requires beams is not empty.
|
|
size_t find_common_prefix_length() {
|
|
size_t common_prefix_length = beams[0].tokens.size();
|
|
for (size_t i = 1 ; i < beams.size() ; ++i) {
|
|
common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
|
|
for (size_t j = 0 ; j < common_prefix_length ; ++j) {
|
|
if (beams[0].tokens[j] != beams[i].tokens[j]) {
|
|
common_prefix_length = j;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
return common_prefix_length;
|
|
}
|
|
|
|
// Construct beams_state to send back to caller via the callback function.
|
|
// Side effect: set common_prefix_length = find_common_prefix_length();
|
|
llama_beams_state get_beams_state(const bool last_call) {
|
|
for (size_t i = 0 ; i < beams.size() ; ++i) {
|
|
beam_views[i] = beams[i].view();
|
|
}
|
|
common_prefix_length = find_common_prefix_length();
|
|
return {beam_views.data(), beams.size(), common_prefix_length, last_call};
|
|
}
|
|
|
|
// Loop:
|
|
// * while i < n_predict, AND
|
|
// * any of the beams have not yet reached end-of-beam (eob), AND
|
|
// * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
|
|
// (since all other beam probabilities can only decrease)
|
|
void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
|
|
beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob.
|
|
const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
|
|
for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
|
|
!beams[top_beam_index()].eob ; ++i) {
|
|
callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
|
|
update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
|
|
if (common_prefix_length) {
|
|
llama_eval(ctx, beams[0].tokens.data(), common_prefix_length, n_past, n_threads);
|
|
n_past += common_prefix_length;
|
|
}
|
|
// Zero-out next_beam probabilities to place them last in following min-heap.
|
|
std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
|
|
for (llama_beam & beam : beams) {
|
|
beam.shift_tokens(common_prefix_length);
|
|
fill_next_beams_by_top_probabilities(beam);
|
|
}
|
|
// next_beams become the beams of next/final iteration. Swap them to re-use memory.
|
|
beams.swap(next_beams);
|
|
renormalize_beam_probabilities(beams);
|
|
}
|
|
collapse_beams(top_beam_index());
|
|
callback(callback_data, get_beams_state(true));
|
|
}
|
|
|
|
// As beams grow, the cumulative probabilities decrease.
|
|
// Renormalize them to avoid floating point underflow.
|
|
static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
|
|
const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
|
|
const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
|
|
std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
|
|
}
|
|
|
|
// Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
|
|
size_t top_beam_index() {
|
|
return std::max_element(beams.begin(), beams.end()) - beams.begin();
|
|
}
|
|
|
|
// Copy (p,eob) for each beam which may have been changed by the callback.
|
|
void update_beams_from_beam_views() {
|
|
for (size_t i = 0 ; i < beams.size() ; ++i) {
|
|
beams[i].p = beam_views[i].p;
|
|
beams[i].eob = beam_views[i].eob;
|
|
}
|
|
}
|
|
};
|
|
|
|
void llama_beam_search(llama_context * ctx,
|
|
llama_beam_search_callback_fn_t callback, void * callback_data,
|
|
size_t n_beams, int n_past, int n_predict, int n_threads) {
|
|
assert(ctx);
|
|
const int64_t t_start_sample_us = ggml_time_us();
|
|
|
|
llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict, n_threads);
|
|
|
|
beam_search_data.loop(callback, callback_data);
|
|
|
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
ctx->n_sample++;
|
|
}
|
|
|
|
//
|
|
// quantization
|
|
//
|
|
|
|
static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vector<float> & output, const size_t nelements, const int nthread) {
|
|
if (output.size() < nelements) {
|
|
output.resize(nelements);
|
|
}
|
|
float * f32_output = (float *) output.data();
|
|
|
|
ggml_type_traits_t qtype;
|
|
if (ggml_is_quantized(tensor->type)) {
|
|
qtype = ggml_internal_get_type_traits(tensor->type);
|
|
if (qtype.to_float == NULL) {
|
|
throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
|
|
}
|
|
} else if (tensor->type != GGML_TYPE_F16) {
|
|
throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
|
|
}
|
|
|
|
if (nthread < 2) {
|
|
if (tensor->type == GGML_TYPE_F16) {
|
|
ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
|
|
} else if (ggml_is_quantized(tensor->type)) {
|
|
qtype.to_float(tensor->data, f32_output, nelements);
|
|
} else {
|
|
GGML_ASSERT(false); // unreachable
|
|
}
|
|
return;
|
|
}
|
|
|
|
auto block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
|
|
auto block_size_bytes = ggml_type_size(tensor->type);
|
|
|
|
GGML_ASSERT(nelements % block_size == 0);
|
|
auto nblocks = nelements / block_size;
|
|
auto blocks_per_thread = nblocks / nthread;
|
|
auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
|
|
|
|
std::vector<std::thread> workers;
|
|
for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
|
|
auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
|
|
auto thr_elems = thr_blocks * block_size; // number of elements for this thread
|
|
auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
|
|
|
|
auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
|
|
if (typ == GGML_TYPE_F16) {
|
|
ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
|
|
} else {
|
|
qtype.to_float(inbuf, outbuf, nels);
|
|
}
|
|
};
|
|
workers.push_back(std::thread(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
|
|
in_buff_offs += thr_block_bytes;
|
|
out_buff_offs += thr_elems;
|
|
}
|
|
for (auto & worker : workers) {
|
|
worker.join();
|
|
}
|
|
}
|
|
|
|
static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
|
|
ggml_type quantized_type;
|
|
llama_ftype ftype = params->ftype;
|
|
|
|
switch (params->ftype) {
|
|
case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
|
|
case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
|
|
case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
|
|
case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
|
|
case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
|
|
case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
|
|
case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
|
|
|
|
#ifdef GGML_USE_K_QUANTS
|
|
// K-quants
|
|
case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
|
|
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
|
|
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
|
|
case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
|
|
case LLAMA_FTYPE_MOSTLY_Q4_K_S:
|
|
case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break;
|
|
case LLAMA_FTYPE_MOSTLY_Q5_K_S:
|
|
case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
|
|
case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
|
|
#endif
|
|
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
|
}
|
|
|
|
int nthread = params->nthread;
|
|
|
|
if (nthread <= 0) {
|
|
nthread = std::thread::hardware_concurrency();
|
|
}
|
|
|
|
std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));
|
|
|
|
llama_model model;
|
|
llm_load_arch(*ml, model);
|
|
llm_load_hparams(*ml, model, 0, 0, 0);
|
|
|
|
const size_t align = GGUF_DEFAULT_ALIGNMENT;
|
|
struct gguf_context * ctx_out = gguf_init_empty();
|
|
|
|
// copy the KV pairs from the input file
|
|
gguf_set_kv (ctx_out, ml->ctx_gguf);
|
|
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
|
|
gguf_set_val_u32(ctx_out, "general.file_type", ftype);
|
|
|
|
#ifdef GGML_USE_K_QUANTS
|
|
int n_attention_wv = 0;
|
|
int n_feed_forward_w2 = 0;
|
|
|
|
for (int i = 0; i < ml->n_tensors; ++i) {
|
|
struct ggml_tensor * meta = ml->get_tensor_meta(i);
|
|
|
|
const std::string name = ggml_get_name(meta);
|
|
|
|
// TODO: avoid hardcoded tensor names - use the TN_* constants
|
|
if (name.find("attn_v.weight") != std::string::npos) {
|
|
++n_attention_wv;
|
|
}
|
|
else if (name.find("ffn_down.weight") != std::string::npos) {
|
|
++n_feed_forward_w2;
|
|
}
|
|
}
|
|
if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) {
|
|
LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
|
|
__func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer);
|
|
}
|
|
|
|
int i_attention_wv = 0;
|
|
int i_feed_forward_w2 = 0;
|
|
#endif
|
|
|
|
size_t total_size_org = 0;
|
|
size_t total_size_new = 0;
|
|
std::vector<int64_t> hist_all(1 << 4, 0);
|
|
|
|
std::vector<std::thread> workers;
|
|
std::mutex mutex;
|
|
|
|
auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
|
|
return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
|
|
};
|
|
|
|
int idx = 0;
|
|
|
|
std::vector<uint8_t> read_data;
|
|
std::vector<uint8_t> work;
|
|
|
|
// populate the original tensors so we get an initial meta data
|
|
for (int i = 0; i < ml->n_tensors; ++i) {
|
|
struct ggml_tensor * meta = ml->get_tensor_meta(i);
|
|
gguf_add_tensor(ctx_out, meta);
|
|
}
|
|
|
|
std::ofstream fout(fname_out, std::ios::binary);
|
|
|
|
const size_t meta_size = gguf_get_meta_size(ctx_out);
|
|
|
|
LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size);
|
|
|
|
// placeholder for the meta data
|
|
::zeros(fout, meta_size);
|
|
|
|
for (int i = 0; i < ml->n_tensors; ++i) {
|
|
struct ggml_tensor * tensor = ml->get_tensor_meta(i);
|
|
|
|
const std::string name = ggml_get_name(tensor);
|
|
|
|
read_data.resize(ggml_nbytes(tensor));
|
|
tensor->data = read_data.data();
|
|
ml->load_data_for(tensor);
|
|
|
|
LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
|
|
++idx, ml->n_tensors,
|
|
ggml_get_name(tensor),
|
|
llama_format_tensor_shape(tensor).c_str(),
|
|
ggml_type_name(tensor->type));
|
|
|
|
// This used to be a regex, but <regex> has an extreme cost to compile times.
|
|
bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
|
|
|
|
// quantize only 2D tensors
|
|
quantize &= (tensor->n_dims == 2);
|
|
quantize &= params->quantize_output_tensor || name != "output.weight";
|
|
quantize &= quantized_type != tensor->type;
|
|
|
|
enum ggml_type new_type;
|
|
void * new_data;
|
|
size_t new_size;
|
|
|
|
if (!quantize) {
|
|
new_type = tensor->type;
|
|
new_data = tensor->data;
|
|
new_size = ggml_nbytes(tensor);
|
|
LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
|
|
} else {
|
|
new_type = quantized_type;
|
|
#ifdef GGML_USE_K_QUANTS
|
|
// TODO: avoid hardcoded tensor names - use the TN_* constants
|
|
const auto tn = LLM_TN(ml->get_arch());
|
|
|
|
if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
|
|
int nx = tensor->ne[0];
|
|
if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
|
|
new_type = GGML_TYPE_Q8_0;
|
|
}
|
|
else if (new_type != GGML_TYPE_Q8_0) {
|
|
new_type = GGML_TYPE_Q6_K;
|
|
}
|
|
} else if (name.find("attn_v.weight") != std::string::npos) {
|
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
|
new_type = i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
|
}
|
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
|
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
|
use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
|
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
|
|
else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
|
|
(i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
|
|
if (model.type == MODEL_70B) {
|
|
// In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
|
|
// 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
|
|
// nearly negligible increase in model size by quantizing this tensor with more bits:
|
|
if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
|
|
}
|
|
++i_attention_wv;
|
|
} else if (name.find("ffn_down.weight") != std::string::npos) {
|
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
|
new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
|
|
: model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
|
|
: GGML_TYPE_Q3_K;
|
|
}
|
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
|
|
new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
|
|
}
|
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
|
|
if (model.arch == LLM_ARCH_FALCON) {
|
|
new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
|
|
use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
|
} else {
|
|
if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
|
|
}
|
|
}
|
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
|
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
|
|
new_type = GGML_TYPE_Q5_K;
|
|
}
|
|
++i_feed_forward_w2;
|
|
} else if (name.find("attn_output.weight") != std::string::npos) {
|
|
if (model.arch != LLM_ARCH_FALCON) {
|
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
|
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
|
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
|
} else {
|
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
|
|
}
|
|
}
|
|
else if (name.find("attn_qkv.weight") != std::string::npos) {
|
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
|
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
|
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
|
|
}
|
|
else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
|
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
|
}
|
|
// This can be used to reduce the size of the Q5_K_S model.
|
|
// The associated PPL increase is fully in line with the size reduction
|
|
//else {
|
|
// if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
|
|
//}
|
|
bool convert_incompatible_tensor = false;
|
|
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
|
|
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
|
|
int nx = tensor->ne[0];
|
|
int ny = tensor->ne[1];
|
|
if (nx % QK_K != 0) {
|
|
LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
|
|
convert_incompatible_tensor = true;
|
|
}
|
|
}
|
|
if (convert_incompatible_tensor) {
|
|
if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
|
|
new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
|
|
LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
|
|
} else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
|
|
new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
|
|
LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
|
|
} else {
|
|
throw std::runtime_error("Unsupported tensor size encountered\n");
|
|
}
|
|
}
|
|
#endif
|
|
|
|
const size_t nelements = ggml_nelements(tensor);
|
|
|
|
float * f32_data;
|
|
std::vector<float> f32_conv_buf;
|
|
|
|
if (tensor->type == GGML_TYPE_F32) {
|
|
f32_data = (float *) tensor->data;
|
|
} else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
|
|
throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
|
|
} else {
|
|
llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
|
|
f32_data = (float *) f32_conv_buf.data();
|
|
}
|
|
|
|
LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
|
|
fflush(stdout);
|
|
|
|
work.resize(nelements * 4); // upper bound on size
|
|
new_data = work.data();
|
|
std::vector<int64_t> hist_cur(1 << 4, 0);
|
|
|
|
static const int chunk_size = 32 * 512;
|
|
const int nchunk = (nelements + chunk_size - 1)/chunk_size;
|
|
const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
|
|
if (nthread_use < 2) {
|
|
new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
|
|
} else {
|
|
size_t counter = 0;
|
|
new_size = 0;
|
|
auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
|
|
std::vector<int64_t> local_hist;
|
|
size_t local_size = 0;
|
|
while (true) {
|
|
std::unique_lock<std::mutex> lock(mutex);
|
|
size_t first = counter; counter += chunk_size;
|
|
if (first >= nelements) {
|
|
if (!local_hist.empty()) {
|
|
for (int j=0; j<int(local_hist.size()); ++j) {
|
|
hist_cur[j] += local_hist[j];
|
|
}
|
|
new_size += local_size;
|
|
}
|
|
break;
|
|
}
|
|
lock.unlock();
|
|
size_t last = std::min(nelements, first + chunk_size);
|
|
if (local_hist.empty()) {
|
|
local_hist.resize(hist_cur.size(), 0);
|
|
}
|
|
local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
|
|
}
|
|
};
|
|
if ((int) workers.size() < nthread_use - 1) {
|
|
workers.resize(nthread_use - 1);
|
|
}
|
|
for (int it = 0; it < nthread_use - 1; ++it) {
|
|
workers[it] = std::thread(compute);
|
|
}
|
|
compute();
|
|
for (int it = 0; it < nthread_use - 1; ++it) {
|
|
workers[it].join();
|
|
}
|
|
}
|
|
|
|
LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
|
|
int64_t tot_count = 0;
|
|
for (size_t i = 0; i < hist_cur.size(); i++) {
|
|
hist_all[i] += hist_cur[i];
|
|
tot_count += hist_cur[i];
|
|
}
|
|
|
|
if (tot_count > 0) {
|
|
for (size_t i = 0; i < hist_cur.size(); i++) {
|
|
LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements));
|
|
}
|
|
}
|
|
LLAMA_LOG_INFO("\n");
|
|
}
|
|
total_size_org += ggml_nbytes(tensor);
|
|
total_size_new += new_size;
|
|
|
|
// update the gguf meta data as we go
|
|
gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
|
|
gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
|
|
|
|
// write tensor data + padding
|
|
fout.write((const char *) new_data, new_size);
|
|
zeros(fout, GGML_PAD(new_size, align) - new_size);
|
|
}
|
|
|
|
// go back to beginning of file and write the updated meta data
|
|
{
|
|
fout.seekp(0);
|
|
std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
|
|
gguf_get_meta_data(ctx_out, data.data());
|
|
fout.write((const char *) data.data(), data.size());
|
|
}
|
|
|
|
fout.close();
|
|
|
|
gguf_free(ctx_out);
|
|
|
|
LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
|
|
LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
|
|
|
|
// print histogram for all tensors
|
|
{
|
|
int64_t sum_all = 0;
|
|
for (size_t i = 0; i < hist_all.size(); i++) {
|
|
sum_all += hist_all[i];
|
|
}
|
|
|
|
if (sum_all > 0) {
|
|
LLAMA_LOG_INFO("%s: hist: ", __func__);
|
|
for (size_t i = 0; i < hist_all.size(); i++) {
|
|
LLAMA_LOG_INFO("%5.3f ", hist_all[i] / float(sum_all));
|
|
}
|
|
LLAMA_LOG_INFO("\n");
|
|
}
|
|
}
|
|
}
|
|
|
|
// TODO: after the GGUF PR, this likely won't work and needs to be updated
|
|
int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
|
|
LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
|
|
|
|
const int64_t t_start_lora_us = ggml_time_us();
|
|
|
|
auto fin = std::ifstream(path_lora, std::ios::binary);
|
|
if (!fin) {
|
|
LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, path_lora);
|
|
return 1;
|
|
}
|
|
|
|
// verify magic and version
|
|
{
|
|
uint32_t magic;
|
|
fin.read((char *) &magic, sizeof(magic));
|
|
uint32_t format_version;
|
|
fin.read((char *) &format_version, sizeof(format_version));
|
|
|
|
if (format_version != 1) {
|
|
LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
int32_t lora_r;
|
|
int32_t lora_alpha;
|
|
fin.read((char *) &lora_r, sizeof(lora_r));
|
|
fin.read((char *) &lora_alpha, sizeof(lora_alpha));
|
|
float scaling = (float)lora_alpha / (float)lora_r;
|
|
|
|
LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
|
|
|
|
// create a temporary ggml context to store the lora tensors
|
|
// todo: calculate size from biggest possible tensor
|
|
std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
|
|
struct ggml_init_params params;
|
|
params.mem_size = lora_buf.size();
|
|
params.mem_buffer = lora_buf.data();
|
|
params.no_alloc = false;
|
|
|
|
ggml_context * lora_ctx = ggml_init(params);
|
|
std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
|
|
|
|
// create a name -> tensor map of the model to accelerate lookups
|
|
std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
|
|
for (const auto & kv : model.tensors_by_name) {
|
|
model_tensors.insert(kv);
|
|
}
|
|
|
|
// load base model
|
|
std::unique_ptr<llama_model_loader> ml;
|
|
ggml_context * base_ctx = NULL;
|
|
std::vector<uint8_t> base_buf;
|
|
if (path_base_model) {
|
|
LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
|
|
ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
|
|
|
|
size_t ctx_size;
|
|
size_t mmapped_size;
|
|
ml->calc_sizes(ctx_size, mmapped_size);
|
|
base_buf.resize(ctx_size);
|
|
|
|
ggml_init_params base_params;
|
|
base_params.mem_size = base_buf.size();
|
|
base_params.mem_buffer = base_buf.data();
|
|
base_params.no_alloc = ml->use_mmap;
|
|
|
|
base_ctx = ggml_init(base_params);
|
|
|
|
// maybe this should in llama_model_loader
|
|
if (ml->use_mmap) {
|
|
ml->mapping.reset(new llama_mmap(&ml->file, /* prefetch */ 0, ggml_is_numa()));
|
|
}
|
|
}
|
|
|
|
// read tensors and apply
|
|
bool warned = false;
|
|
int n_tensors = 0;
|
|
|
|
std::vector<uint8_t> work_buffer;
|
|
|
|
while (true) {
|
|
int32_t n_dims;
|
|
int32_t length;
|
|
int32_t ftype;
|
|
|
|
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
|
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
|
|
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
|
|
if (fin.eof()) {
|
|
break;
|
|
}
|
|
|
|
int32_t ne[2] = { 1, 1 };
|
|
for (int i = 0; i < n_dims; ++i) {
|
|
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
|
}
|
|
|
|
std::string name;
|
|
{
|
|
char buf[1024];
|
|
fin.read(buf, length);
|
|
name = std::string(buf, length);
|
|
}
|
|
|
|
// check for lora suffix and get the type of tensor
|
|
const std::string lora_suffix = ".lora";
|
|
size_t pos = name.rfind(lora_suffix);
|
|
if (pos == std::string::npos) {
|
|
LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
|
|
return 1;
|
|
}
|
|
|
|
std::string lora_type = name.substr(pos + lora_suffix.length());
|
|
std::string base_name = name;
|
|
base_name.erase(pos);
|
|
// LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
|
|
|
|
if (model_tensors.find(base_name) == model_tensors.end()) {
|
|
LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
|
|
return 1;
|
|
}
|
|
|
|
// create ggml tensor
|
|
ggml_type wtype;
|
|
switch (ftype) {
|
|
case 0: wtype = GGML_TYPE_F32; break;
|
|
case 1: wtype = GGML_TYPE_F16; break;
|
|
default:
|
|
{
|
|
LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
|
|
__func__, ftype);
|
|
return false;
|
|
}
|
|
}
|
|
ggml_tensor * lora_tensor;
|
|
if (n_dims == 2) {
|
|
lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
|
|
}
|
|
else {
|
|
LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
|
|
return 1;
|
|
}
|
|
ggml_set_name(lora_tensor, "lora_tensor");
|
|
|
|
// load tensor data
|
|
size_t offset = fin.tellg();
|
|
size_t tensor_data_size = ggml_nbytes(lora_tensor);
|
|
offset = (offset + 31) & -32;
|
|
fin.seekg(offset);
|
|
fin.read((char*)lora_tensor->data, tensor_data_size);
|
|
|
|
lora_tensors[name] = lora_tensor;
|
|
|
|
// check if we have both A and B tensors and apply
|
|
if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() &&
|
|
lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
|
|
|
|
ggml_tensor * dest_t = model_tensors[base_name];
|
|
|
|
offload_func_t offload_func = llama_nop;
|
|
offload_func_t offload_func_force_inplace = llama_nop;
|
|
|
|
#ifdef GGML_USE_CUBLAS
|
|
if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
|
|
if (dest_t->type != GGML_TYPE_F16) {
|
|
throw std::runtime_error(format(
|
|
"%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__));
|
|
}
|
|
offload_func = ggml_cuda_assign_buffers;
|
|
offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace;
|
|
}
|
|
#endif // GGML_USE_CUBLAS
|
|
|
|
ggml_tensor * base_t;
|
|
if (ml) {
|
|
struct gguf_context * ctx_gguf = ml->ctx_gguf;
|
|
|
|
// load from base model
|
|
if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) {
|
|
// TODO: throw
|
|
LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
|
|
return 1;
|
|
}
|
|
|
|
// TODO: not tested!! maybe not working!
|
|
base_t = ml->create_tensor(base_ctx, base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
|
|
ml->load_data_for(base_t);
|
|
} else {
|
|
base_t = dest_t;
|
|
}
|
|
|
|
if (ggml_is_quantized(base_t->type)) {
|
|
if (!warned) {
|
|
LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
|
|
"use a f16 or f32 base model with --lora-base\n", __func__);
|
|
warned = true;
|
|
}
|
|
}
|
|
|
|
ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
|
|
GGML_ASSERT(loraA->type == GGML_TYPE_F32);
|
|
ggml_set_name(loraA, "loraA");
|
|
|
|
ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
|
|
GGML_ASSERT(loraB->type == GGML_TYPE_F32);
|
|
ggml_set_name(loraB, "loraB");
|
|
|
|
if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
|
|
LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
|
|
" are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
|
|
return 1;
|
|
}
|
|
|
|
// w = w + BA*s
|
|
ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
|
|
offload_func(BA);
|
|
ggml_set_name(BA, "BA");
|
|
|
|
if (scaling != 1.0f) {
|
|
ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
|
|
ggml_set_name(scale_tensor, "scale_tensor");
|
|
|
|
BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
|
|
offload_func(BA);
|
|
ggml_set_name(BA, "BA_scaled");
|
|
}
|
|
|
|
ggml_tensor * r;
|
|
if (base_t == dest_t) {
|
|
r = ggml_add_inplace(lora_ctx, dest_t, BA);
|
|
offload_func_force_inplace(r);
|
|
ggml_set_name(r, "r_add_inplace");
|
|
}
|
|
else {
|
|
r = ggml_add(lora_ctx, base_t, BA);
|
|
offload_func(r);
|
|
ggml_set_name(r, "r_add");
|
|
|
|
r = ggml_cpy(lora_ctx, r, dest_t);
|
|
offload_func(r);
|
|
ggml_set_name(r, "r_cpy");
|
|
}
|
|
|
|
struct ggml_cgraph gf = ggml_build_forward(r);
|
|
|
|
ggml_graph_compute_helper(work_buffer, &gf, n_threads);
|
|
|
|
// we won't need these tensors again, reset the context to save memory
|
|
ggml_free(lora_ctx);
|
|
lora_ctx = ggml_init(params);
|
|
lora_tensors.clear();
|
|
|
|
n_tensors++;
|
|
if (n_tensors % 4 == 0) {
|
|
LLAMA_LOG_INFO(".");
|
|
}
|
|
}
|
|
}
|
|
|
|
// TODO: this should be in a destructor, it will leak on failure
|
|
ggml_free(lora_ctx);
|
|
if (base_ctx) {
|
|
ggml_free(base_ctx);
|
|
}
|
|
|
|
const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
|
|
LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
|
|
|
|
return 0;
|
|
}
|
|
|
|
//
|
|
// interface implementation
|
|
//
|
|
|
|
struct llama_context_params llama_context_default_params() {
|
|
struct llama_context_params result = {
|
|
/*.seed =*/ LLAMA_DEFAULT_SEED,
|
|
/*.n_ctx =*/ 512,
|
|
/*.n_batch =*/ 512,
|
|
/*.gpu_layers =*/ 0,
|
|
/*.main_gpu =*/ 0,
|
|
/*.tensor_split =*/ nullptr,
|
|
/*.rope_freq_base =*/ 10000.0f,
|
|
/*.rope_freq_scale =*/ 1.0f,
|
|
/*.progress_callback =*/ nullptr,
|
|
/*.progress_callback_user_data =*/ nullptr,
|
|
/*.low_vram =*/ false,
|
|
/*.mul_mat_q =*/ false,
|
|
/*.f16_kv =*/ true,
|
|
/*.logits_all =*/ false,
|
|
/*.vocab_only =*/ false,
|
|
/*.use_mmap =*/ true,
|
|
/*.use_mlock =*/ false,
|
|
/*.embedding =*/ false,
|
|
};
|
|
|
|
return result;
|
|
}
|
|
|
|
struct llama_model_quantize_params llama_model_quantize_default_params() {
|
|
struct llama_model_quantize_params result = {
|
|
/*.nthread =*/ 0,
|
|
/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
|
|
/*.allow_requantize =*/ false,
|
|
/*.quantize_output_tensor =*/ true,
|
|
};
|
|
|
|
return result;
|
|
}
|
|
|
|
int llama_max_devices(void) {
|
|
return LLAMA_MAX_DEVICES;
|
|
}
|
|
|
|
bool llama_mmap_supported(void) {
|
|
return llama_mmap::SUPPORTED;
|
|
}
|
|
|
|
bool llama_mlock_supported(void) {
|
|
return llama_mlock::SUPPORTED;
|
|
}
|
|
|
|
void llama_backend_init(bool numa) {
|
|
ggml_time_init();
|
|
|
|
// needed to initialize f16 tables
|
|
{
|
|
struct ggml_init_params params = { 0, NULL, false };
|
|
struct ggml_context * ctx = ggml_init(params);
|
|
ggml_free(ctx);
|
|
}
|
|
|
|
if (numa) {
|
|
ggml_numa_init();
|
|
}
|
|
|
|
#ifdef GGML_USE_MPI
|
|
ggml_mpi_backend_init();
|
|
#endif
|
|
}
|
|
|
|
void llama_backend_free(void) {
|
|
#ifdef GGML_USE_MPI
|
|
ggml_mpi_backend_free();
|
|
#endif
|
|
}
|
|
|
|
int64_t llama_time_us(void) {
|
|
return ggml_time_us();
|
|
}
|
|
|
|
struct llama_model * llama_load_model_from_file(
|
|
const char * path_model,
|
|
struct llama_context_params params) {
|
|
ggml_time_init();
|
|
|
|
llama_model * model = new llama_model;
|
|
|
|
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
|
|
|
unsigned cur_percentage = 0;
|
|
if (params.progress_callback == NULL) {
|
|
params.progress_callback_user_data = &cur_percentage;
|
|
params.progress_callback = [](float progress, void * ctx) {
|
|
unsigned * cur_percentage_p = (unsigned *) ctx;
|
|
unsigned percentage = (unsigned) (100 * progress);
|
|
while (percentage > *cur_percentage_p) {
|
|
*cur_percentage_p = percentage;
|
|
LLAMA_LOG_INFO(".");
|
|
if (percentage >= 100) {
|
|
LLAMA_LOG_INFO("\n");
|
|
}
|
|
}
|
|
};
|
|
}
|
|
|
|
if (!llama_model_load(path_model, *model, params.n_ctx, params.n_batch, params.n_gpu_layers,
|
|
params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,
|
|
params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only,
|
|
params.progress_callback, params.progress_callback_user_data)) {
|
|
LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
|
|
delete model;
|
|
return nullptr;
|
|
}
|
|
|
|
return model;
|
|
}
|
|
|
|
void llama_free_model(struct llama_model * model) {
|
|
delete model;
|
|
}
|
|
|
|
struct llama_context * llama_new_context_with_model(
|
|
struct llama_model * model,
|
|
struct llama_context_params params) {
|
|
|
|
if (!model) {
|
|
return nullptr;
|
|
}
|
|
|
|
llama_context * ctx = new llama_context(*model);
|
|
|
|
if (params.seed == LLAMA_DEFAULT_SEED) {
|
|
params.seed = time(NULL);
|
|
}
|
|
|
|
ctx->rng = std::mt19937(params.seed);
|
|
ctx->logits_all = params.logits_all;
|
|
|
|
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
|
|
|
// reserve memory for context buffers
|
|
if (!params.vocab_only) {
|
|
if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
|
|
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
|
|
llama_free(ctx);
|
|
return nullptr;
|
|
}
|
|
|
|
{
|
|
const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
|
|
LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
|
|
}
|
|
|
|
const auto & hparams = ctx->model.hparams;
|
|
|
|
// resized during inference
|
|
if (params.logits_all) {
|
|
ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
|
|
} else {
|
|
ctx->logits.reserve(hparams.n_vocab);
|
|
}
|
|
|
|
if (params.embedding){
|
|
ctx->embedding.resize(hparams.n_embd);
|
|
}
|
|
|
|
{
|
|
static const size_t tensor_alignment = 32;
|
|
// the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
|
|
ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
|
|
|
|
// create measure allocator
|
|
ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
|
|
|
|
// build worst-case graph
|
|
int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
|
|
int n_past = hparams.n_ctx - n_tokens;
|
|
llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
|
|
ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
|
|
#ifdef GGML_USE_METAL
|
|
if (params.n_gpu_layers > 0) {
|
|
ctx->ctx_metal = ggml_metal_init(1);
|
|
if (!ctx->ctx_metal) {
|
|
LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
|
|
llama_free(ctx);
|
|
return NULL;
|
|
}
|
|
ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
|
|
ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
|
|
}
|
|
#endif
|
|
// measure memory requirements for the graph
|
|
size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
|
|
|
|
LLAMA_LOG_INFO("%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
|
|
|
|
// recreate allocator with exact memory requirements
|
|
ggml_allocr_free(ctx->alloc);
|
|
|
|
ctx->buf_alloc.resize(alloc_size);
|
|
ctx->alloc = ggml_allocr_new(ctx->buf_alloc.data, ctx->buf_alloc.size, tensor_alignment);
|
|
#ifdef GGML_USE_METAL
|
|
if (ctx->ctx_metal) {
|
|
ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
|
|
}
|
|
#endif
|
|
#ifdef GGML_USE_CUBLAS
|
|
if (params.low_vram) {
|
|
LLAMA_LOG_INFO("%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
|
|
ggml_cuda_set_scratch_size(0); // disable scratch
|
|
} else {
|
|
ggml_cuda_set_scratch_size(alloc_size);
|
|
LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
|
|
}
|
|
#endif
|
|
}
|
|
}
|
|
|
|
#ifdef GGML_USE_METAL
|
|
if (params.n_gpu_layers > 0) {
|
|
// this allocates all Metal resources and memory buffers
|
|
|
|
void * data_ptr = NULL;
|
|
size_t data_size = 0;
|
|
|
|
if (params.use_mmap) {
|
|
data_ptr = ctx->model.mapping->addr;
|
|
data_size = ctx->model.mapping->size;
|
|
} else {
|
|
data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
|
|
data_size = ggml_get_mem_size (ctx->model.ctx);
|
|
}
|
|
|
|
const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
|
|
|
|
LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
|
|
|
|
#define LLAMA_METAL_CHECK_BUF(result) \
|
|
if (!(result)) { \
|
|
LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
|
|
llama_free(ctx); \
|
|
return NULL; \
|
|
}
|
|
|
|
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
|
|
|
|
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
|
|
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
|
|
|
|
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
|
|
#undef LLAMA_METAL_CHECK_BUF
|
|
}
|
|
#endif
|
|
|
|
#ifdef GGML_USE_MPI
|
|
ctx->ctx_mpi = ggml_mpi_init();
|
|
|
|
if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
|
|
// Enter a blocking eval loop with dummy input, letting rank=0 drive the process
|
|
const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
|
|
while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
|
|
llama_backend_free();
|
|
exit(1);
|
|
}
|
|
#endif
|
|
|
|
return ctx;
|
|
}
|
|
|
|
struct llama_context * llama_init_from_file(
|
|
const char * path_model,
|
|
struct llama_context_params params) {
|
|
struct llama_model * model = llama_load_model_from_file(path_model, params);
|
|
if (!model) {
|
|
return nullptr;
|
|
}
|
|
|
|
struct llama_context * ctx = llama_new_context_with_model(model, params);
|
|
ctx->model_owner = true;
|
|
|
|
return ctx;
|
|
}
|
|
|
|
void llama_free(struct llama_context * ctx) {
|
|
delete ctx;
|
|
}
|
|
|
|
int llama_n_vocab(const struct llama_context * ctx) {
|
|
return ctx->model.vocab.id_to_token.size();
|
|
}
|
|
|
|
int llama_n_ctx(const struct llama_context * ctx) {
|
|
return ctx->model.hparams.n_ctx;
|
|
}
|
|
|
|
int llama_n_embd(const struct llama_context * ctx) {
|
|
return ctx->model.hparams.n_embd;
|
|
}
|
|
|
|
enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) {
|
|
return ctx->model.vocab.type;
|
|
}
|
|
|
|
int llama_model_n_vocab(const struct llama_model * model) {
|
|
return model->vocab.id_to_token.size();
|
|
}
|
|
|
|
int llama_model_n_ctx(const struct llama_model * model) {
|
|
return model->hparams.n_ctx;
|
|
}
|
|
|
|
int llama_model_n_embd(const struct llama_model * model) {
|
|
return model->hparams.n_embd;
|
|
}
|
|
|
|
int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
|
|
return snprintf(buf, buf_size, "%s %s %s",
|
|
model->name.c_str(),
|
|
llama_model_type_name(model->type),
|
|
llama_model_ftype_name(model->ftype).c_str());
|
|
}
|
|
|
|
uint64_t llama_model_size(const struct llama_model * model) {
|
|
uint64_t size = 0;
|
|
for (const auto & it : model->tensors_by_name) {
|
|
size += ggml_nbytes(it.second);
|
|
}
|
|
return size;
|
|
}
|
|
|
|
uint64_t llama_model_n_params(const struct llama_model * model) {
|
|
uint64_t nparams = 0;
|
|
for (const auto & it : model->tensors_by_name) {
|
|
nparams += ggml_nelements(it.second);
|
|
}
|
|
return nparams;
|
|
}
|
|
|
|
int llama_model_quantize(
|
|
const char * fname_inp,
|
|
const char * fname_out,
|
|
const llama_model_quantize_params * params) {
|
|
try {
|
|
llama_model_quantize_internal(fname_inp, fname_out, params);
|
|
return 0;
|
|
} catch (const std::exception & err) {
|
|
LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
|
|
try {
|
|
return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
|
|
} catch (const std::exception & err) {
|
|
LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
|
|
try {
|
|
return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
|
|
} catch (const std::exception & err) {
|
|
LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
|
|
return ctx->kv_self.n;
|
|
}
|
|
|
|
#define LLAMA_MAX_RNG_STATE (64*1024)
|
|
|
|
void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
|
|
if (seed == LLAMA_DEFAULT_SEED) {
|
|
seed = time(NULL);
|
|
}
|
|
ctx->rng.seed(seed);
|
|
}
|
|
|
|
// Returns the *maximum* size of the state
|
|
size_t llama_get_state_size(const struct llama_context * ctx) {
|
|
// we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
|
|
// for reference, std::mt19937(1337) serializes to 6701 bytes.
|
|
const size_t s_rng_size = sizeof(size_t);
|
|
const size_t s_rng = LLAMA_MAX_RNG_STATE;
|
|
const size_t s_logits_capacity = sizeof(size_t);
|
|
const size_t s_logits_size = sizeof(size_t);
|
|
const size_t s_logits = ctx->logits.capacity() * sizeof(float);
|
|
const size_t s_embedding_size = sizeof(size_t);
|
|
const size_t s_embedding = ctx->embedding.size() * sizeof(float);
|
|
const size_t s_kv_size = sizeof(size_t);
|
|
const size_t s_kv_ntok = sizeof(int);
|
|
const size_t s_kv = ctx->kv_self.buf.size;
|
|
|
|
const size_t s_total = (
|
|
+ s_rng_size
|
|
+ s_rng
|
|
+ s_logits_capacity
|
|
+ s_logits_size
|
|
+ s_logits
|
|
+ s_embedding_size
|
|
+ s_embedding
|
|
+ s_kv_size
|
|
+ s_kv_ntok
|
|
+ s_kv
|
|
);
|
|
|
|
return s_total;
|
|
}
|
|
|
|
// llama_context_data
|
|
struct llama_data_context {
|
|
virtual void write(const void * src, size_t size) = 0;
|
|
virtual size_t get_size_written() = 0;
|
|
virtual ~llama_data_context() = default;
|
|
};
|
|
|
|
struct llama_data_buffer_context : llama_data_context {
|
|
uint8_t * ptr;
|
|
size_t size_written = 0;
|
|
|
|
llama_data_buffer_context(uint8_t * p) : ptr(p) {}
|
|
|
|
void write(const void * src, size_t size) override {
|
|
memcpy(ptr, src, size);
|
|
ptr += size;
|
|
size_written += size;
|
|
}
|
|
|
|
size_t get_size_written() override {
|
|
return size_written;
|
|
}
|
|
};
|
|
|
|
struct llama_data_file_context : llama_data_context {
|
|
llama_file * file;
|
|
size_t size_written = 0;
|
|
|
|
llama_data_file_context(llama_file * f) : file(f) {}
|
|
|
|
void write(const void * src, size_t size) override {
|
|
file->write_raw(src, size);
|
|
size_written += size;
|
|
}
|
|
|
|
size_t get_size_written() override {
|
|
return size_written;
|
|
}
|
|
};
|
|
|
|
/** copy state data into either a buffer or file depending on the passed in context
|
|
*
|
|
* file context:
|
|
* llama_file file("/path", "wb");
|
|
* llama_data_file_context data_ctx(&file);
|
|
* llama_copy_state_data(ctx, &data_ctx);
|
|
*
|
|
* buffer context:
|
|
* std::vector<uint8_t> buf(max_size, 0);
|
|
* llama_data_buffer_context data_ctx(&buf.data());
|
|
* llama_copy_state_data(ctx, &data_ctx);
|
|
*
|
|
*/
|
|
void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
|
|
// copy rng
|
|
{
|
|
std::stringstream rng_ss;
|
|
rng_ss << ctx->rng;
|
|
|
|
const size_t rng_size = rng_ss.str().size();
|
|
char rng_buf[LLAMA_MAX_RNG_STATE];
|
|
|
|
memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
|
|
memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
|
|
|
|
data_ctx->write(&rng_size, sizeof(rng_size));
|
|
data_ctx->write(&rng_buf[0], LLAMA_MAX_RNG_STATE);
|
|
}
|
|
|
|
// copy logits
|
|
{
|
|
const size_t logits_cap = ctx->logits.capacity();
|
|
const size_t logits_size = ctx->logits.size();
|
|
|
|
data_ctx->write(&logits_cap, sizeof(logits_cap));
|
|
data_ctx->write(&logits_size, sizeof(logits_size));
|
|
|
|
if (logits_size) {
|
|
data_ctx->write(ctx->logits.data(), logits_size * sizeof(float));
|
|
}
|
|
|
|
// If there is a gap between the size and the capacity, write padding
|
|
size_t padding_size = (logits_cap - logits_size) * sizeof(float);
|
|
if (padding_size > 0) {
|
|
std::vector<uint8_t> padding(padding_size, 0); // Create a buffer filled with zeros
|
|
data_ctx->write(padding.data(), padding_size);
|
|
}
|
|
}
|
|
|
|
// copy embeddings
|
|
{
|
|
const size_t embedding_size = ctx->embedding.size();
|
|
|
|
data_ctx->write(&embedding_size, sizeof(embedding_size));
|
|
|
|
if (embedding_size) {
|
|
data_ctx->write(ctx->embedding.data(), embedding_size * sizeof(float));
|
|
}
|
|
}
|
|
|
|
// copy kv cache
|
|
{
|
|
const auto & kv_self = ctx->kv_self;
|
|
const auto & hparams = ctx->model.hparams;
|
|
const int n_layer = hparams.n_layer;
|
|
const int n_embd = hparams.n_embd_gqa();
|
|
const int n_ctx = hparams.n_ctx;
|
|
|
|
const size_t kv_size = kv_self.buf.size;
|
|
const int kv_ntok = llama_get_kv_cache_token_count(ctx);
|
|
|
|
data_ctx->write(&kv_size, sizeof(kv_size));
|
|
data_ctx->write(&kv_ntok, sizeof(kv_ntok));
|
|
|
|
if (kv_size) {
|
|
const size_t elt_size = ggml_element_size(kv_self.k);
|
|
|
|
ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
|
|
ggml_cgraph gf{};
|
|
|
|
ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
|
|
std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
|
|
kout3d->data = kout3d_data.data();
|
|
|
|
ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
|
|
std::vector<uint8_t> vout3d_data(ggml_nbytes(vout3d), 0);
|
|
vout3d->data = vout3d_data.data();
|
|
|
|
ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
|
|
n_embd, kv_ntok, n_layer,
|
|
elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
|
|
|
|
ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
|
|
kv_ntok, n_embd, n_layer,
|
|
elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
|
|
|
|
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
|
|
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
|
|
ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
|
|
|
|
ggml_free(cpy_ctx);
|
|
|
|
// our data is now in the kout3d_data and vout3d_data buffers
|
|
// write them to file
|
|
data_ctx->write(kout3d_data.data(), kout3d_data.size());
|
|
data_ctx->write(vout3d_data.data(), vout3d_data.size());
|
|
}
|
|
}
|
|
}
|
|
|
|
size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
|
llama_data_buffer_context data_ctx(dst);
|
|
llama_copy_state_data_internal(ctx, &data_ctx);
|
|
|
|
return data_ctx.get_size_written();
|
|
}
|
|
|
|
// Sets the state reading from the specified source address
|
|
size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
uint8_t * inp = src;
|
|
|
|
// set rng
|
|
{
|
|
size_t rng_size;
|
|
char rng_buf[LLAMA_MAX_RNG_STATE];
|
|
|
|
memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
|
|
memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE;
|
|
|
|
std::stringstream rng_ss;
|
|
rng_ss.str(std::string(&rng_buf[0], rng_size));
|
|
rng_ss >> ctx->rng;
|
|
|
|
GGML_ASSERT(rng_ss.fail() == false);
|
|
}
|
|
|
|
// set logits
|
|
{
|
|
size_t logits_cap;
|
|
size_t logits_size;
|
|
|
|
memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap);
|
|
memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);
|
|
|
|
GGML_ASSERT(ctx->logits.capacity() == logits_cap);
|
|
|
|
if (logits_size) {
|
|
ctx->logits.resize(logits_size);
|
|
memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
|
|
}
|
|
|
|
inp += logits_cap * sizeof(float);
|
|
}
|
|
|
|
// set embeddings
|
|
{
|
|
size_t embedding_size;
|
|
|
|
memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);
|
|
|
|
GGML_ASSERT(ctx->embedding.capacity() == embedding_size);
|
|
|
|
if (embedding_size) {
|
|
memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
|
|
inp += embedding_size * sizeof(float);
|
|
}
|
|
}
|
|
|
|
// set kv cache
|
|
{
|
|
const auto & kv_self = ctx->kv_self;
|
|
const auto & hparams = ctx->model.hparams;
|
|
const int n_layer = hparams.n_layer;
|
|
const int n_embd = hparams.n_embd_gqa();
|
|
const int n_ctx = hparams.n_ctx;
|
|
|
|
size_t kv_size;
|
|
int kv_ntok;
|
|
|
|
memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
|
|
memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok);
|
|
|
|
if (kv_size) {
|
|
GGML_ASSERT(kv_self.buf.size == kv_size);
|
|
|
|
const size_t elt_size = ggml_element_size(kv_self.k);
|
|
|
|
ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
|
|
ggml_cgraph gf{};
|
|
|
|
ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
|
|
kin3d->data = (void *) inp;
|
|
inp += ggml_nbytes(kin3d);
|
|
|
|
ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
|
|
vin3d->data = (void *) inp;
|
|
inp += ggml_nbytes(vin3d);
|
|
|
|
ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
|
|
n_embd, kv_ntok, n_layer,
|
|
elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
|
|
|
|
ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
|
|
kv_ntok, n_embd, n_layer,
|
|
elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
|
|
|
|
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
|
|
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
|
|
ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
|
|
|
|
ggml_free(cpy_ctx);
|
|
}
|
|
|
|
ctx->kv_self.n = kv_ntok;
|
|
}
|
|
|
|
const size_t nread = inp - src;
|
|
const size_t max_size = llama_get_state_size(ctx);
|
|
|
|
GGML_ASSERT(nread <= max_size);
|
|
|
|
return nread;
|
|
}
|
|
|
|
static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
|
|
llama_file file(path_session, "rb");
|
|
|
|
// sanity checks
|
|
{
|
|
const uint32_t magic = file.read_u32();
|
|
const uint32_t version = file.read_u32();
|
|
|
|
if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
|
|
LLAMA_LOG_ERROR("%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
|
|
return false;
|
|
}
|
|
|
|
llama_hparams session_hparams;
|
|
file.read_raw(&session_hparams, sizeof(llama_hparams));
|
|
|
|
if (session_hparams != ctx->model.hparams) {
|
|
LLAMA_LOG_INFO("%s : model hparams didn't match from session file!\n", __func__);
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// load the prompt
|
|
{
|
|
const uint32_t n_token_count = file.read_u32();
|
|
|
|
if (n_token_count > n_token_capacity) {
|
|
LLAMA_LOG_ERROR("%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
|
|
return false;
|
|
}
|
|
|
|
file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
|
|
*n_token_count_out = n_token_count;
|
|
}
|
|
|
|
// restore the context state
|
|
{
|
|
const size_t n_state_size_cur = file.size - file.tell();
|
|
const size_t n_state_size_max = llama_get_state_size(ctx);
|
|
|
|
if (n_state_size_cur > n_state_size_max) {
|
|
LLAMA_LOG_ERROR("%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
|
|
return false;
|
|
}
|
|
|
|
std::vector<uint8_t> state_data(n_state_size_max);
|
|
file.read_raw(state_data.data(), n_state_size_cur);
|
|
|
|
llama_set_state_data(ctx, state_data.data());
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
|
|
try {
|
|
return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
|
|
} catch (const std::exception & err) {
|
|
LLAMA_LOG_ERROR("error loading session file: %s\n", err.what());
|
|
return false;
|
|
}
|
|
}
|
|
|
|
bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
|
|
llama_file file(path_session, "wb");
|
|
|
|
file.write_u32(LLAMA_SESSION_MAGIC);
|
|
file.write_u32(LLAMA_SESSION_VERSION);
|
|
|
|
file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
|
|
|
|
// save the prompt
|
|
file.write_u32((uint32_t) n_token_count);
|
|
file.write_raw(tokens, sizeof(llama_token) * n_token_count);
|
|
|
|
// save the context state using stream saving
|
|
llama_data_file_context data_ctx(&file);
|
|
llama_copy_state_data_internal(ctx, &data_ctx);
|
|
|
|
return true;
|
|
}
|
|
|
|
int llama_eval(
|
|
struct llama_context * ctx,
|
|
const llama_token * tokens,
|
|
int n_tokens,
|
|
int n_past,
|
|
int n_threads) {
|
|
if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
|
|
LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
|
|
return 1;
|
|
}
|
|
|
|
// get a more accurate load time, upon first eval
|
|
// TODO: fix this
|
|
if (!ctx->has_evaluated_once) {
|
|
ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
|
|
ctx->has_evaluated_once = true;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
int llama_eval_embd(
|
|
struct llama_context * ctx,
|
|
const float * embd,
|
|
int n_tokens,
|
|
int n_past,
|
|
int n_threads) {
|
|
if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
|
|
LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
|
|
return 1;
|
|
}
|
|
|
|
// get a more accurate load time, upon first eval
|
|
// TODO: fix this
|
|
if (!ctx->has_evaluated_once) {
|
|
ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
|
|
ctx->has_evaluated_once = true;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
int llama_eval_export(struct llama_context * ctx, const char * fname) {
|
|
const int n_batch = 1;
|
|
const int n_ctx = 512 - n_batch;
|
|
|
|
const std::vector<llama_token> tmp(n_batch, llama_token_bos(ctx));
|
|
|
|
if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
|
|
LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
|
|
return 1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
float * llama_get_logits(struct llama_context * ctx) {
|
|
return ctx->logits.data();
|
|
}
|
|
|
|
float * llama_get_embeddings(struct llama_context * ctx) {
|
|
return ctx->embedding.data();
|
|
}
|
|
|
|
const char * llama_token_get_text(const struct llama_context * ctx, llama_token token) {
|
|
return ctx->model.vocab.id_to_token[token].text.c_str();
|
|
}
|
|
|
|
float llama_token_get_score(const struct llama_context * ctx, llama_token token) {
|
|
return ctx->model.vocab.id_to_token[token].score;
|
|
}
|
|
|
|
llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token) {
|
|
return ctx->model.vocab.id_to_token[token].type;
|
|
}
|
|
|
|
llama_token llama_token_bos(const struct llama_context * ctx) {
|
|
return ctx->model.vocab.special_bos_id;
|
|
}
|
|
|
|
llama_token llama_token_eos(const struct llama_context * ctx) {
|
|
return ctx->model.vocab.special_eos_id;
|
|
}
|
|
|
|
llama_token llama_token_nl(const struct llama_context * ctx) {
|
|
return ctx->model.vocab.linefeed_id;
|
|
}
|
|
|
|
int llama_tokenize(
|
|
struct llama_context * ctx,
|
|
const char * text,
|
|
llama_token * tokens,
|
|
int n_max_tokens,
|
|
bool add_bos) {
|
|
return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
|
|
}
|
|
|
|
int llama_tokenize_with_model(
|
|
const struct llama_model * model,
|
|
const char * text,
|
|
llama_token * tokens,
|
|
int n_max_tokens,
|
|
bool add_bos) {
|
|
auto res = llama_tokenize_internal(model->vocab, text, add_bos);
|
|
|
|
if (n_max_tokens < (int) res.size()) {
|
|
LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
|
|
return -((int) res.size());
|
|
}
|
|
|
|
for (size_t i = 0; i < res.size(); i++) {
|
|
tokens[i] = res[i];
|
|
}
|
|
|
|
return res.size();
|
|
}
|
|
|
|
int llama_token_to_piece(const struct llama_context * ctx, llama_token token, char * buf, int length) {
|
|
return llama_token_to_piece_with_model(&ctx->model, token, buf, length);
|
|
}
|
|
|
|
// does not write null-terminator to buf
|
|
int llama_token_to_piece_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
|
|
if (0 <= token && token < llama_model_n_vocab(model)) {
|
|
if (llama_is_normal_token(model->vocab, token)) {
|
|
std::string result = model->vocab.id_to_token[token].text;
|
|
if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) {
|
|
llama_unescape_whitespace(result);
|
|
}
|
|
if (length < (int) result.length()) {
|
|
return -result.length();
|
|
}
|
|
memcpy(buf, result.c_str(), result.length());
|
|
return result.length();
|
|
} else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
|
|
if (length < 3) {
|
|
return -3;
|
|
}
|
|
buf[0] = '\xe2';
|
|
buf[1] = '\x96';
|
|
buf[2] = '\x85';
|
|
return 3;
|
|
} else if (llama_is_control_token(model->vocab, token)) {
|
|
;
|
|
} else if (llama_is_byte_token(model->vocab, token)) {
|
|
if (length < 1) {
|
|
return -1;
|
|
}
|
|
buf[0] = llama_token_to_byte(model->vocab, token);
|
|
return 1;
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
struct llama_timings llama_get_timings(struct llama_context * ctx) {
|
|
struct llama_timings result = {
|
|
/*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
|
|
/*.t_end_ms =*/ 1.00 * ggml_time_ms(),
|
|
/*.t_load_ms =*/ 1e-3 * ctx->t_load_us,
|
|
/*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us,
|
|
/*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
|
|
/*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
|
|
|
|
/*.n_sample =*/ std::max(1, ctx->n_sample),
|
|
/*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
|
|
/*.n_eval =*/ std::max(1, ctx->n_eval),
|
|
};
|
|
|
|
return result;
|
|
}
|
|
|
|
void llama_print_timings(struct llama_context * ctx) {
|
|
const llama_timings timings = llama_get_timings(ctx);
|
|
|
|
LLAMA_LOG_INFO("\n");
|
|
LLAMA_LOG_INFO("%s: load time = %8.2f ms\n", __func__, timings.t_load_ms);
|
|
LLAMA_LOG_INFO("%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
|
__func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
|
|
LLAMA_LOG_INFO("%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
|
|
__func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
|
|
LLAMA_LOG_INFO("%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
|
__func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
|
|
LLAMA_LOG_INFO("%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
|
|
}
|
|
|
|
void llama_reset_timings(struct llama_context * ctx) {
|
|
ctx->t_start_us = ggml_time_us();
|
|
ctx->t_sample_us = ctx->n_sample = 0;
|
|
ctx->t_eval_us = ctx->n_eval = 0;
|
|
ctx->t_p_eval_us = ctx->n_p_eval = 0;
|
|
}
|
|
|
|
const char * llama_print_system_info(void) {
|
|
static std::string s;
|
|
|
|
s = "";
|
|
s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
|
|
s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
|
|
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
|
|
s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
|
|
s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
|
|
s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
|
|
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
|
|
s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
|
|
s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
|
|
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
|
|
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
|
|
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
|
|
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
|
|
s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
|
|
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
|
|
|
return s.c_str();
|
|
}
|
|
|
|
// For internal test use
|
|
const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
|
|
return ctx->model.tensors_by_name;
|
|
}
|
|
|
|
void llama_log_set(llama_log_callback log_callback, void * user_data) {
|
|
g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
|
|
g_state.log_callback_user_data = user_data;
|
|
}
|
|
|
|
#if defined(_MSC_VER) && !defined(vsnprintf)
|
|
#define vsnprintf _vsnprintf
|
|
#endif
|
|
|
|
static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) {
|
|
va_list args_copy;
|
|
va_copy(args_copy, args);
|
|
char buffer[128];
|
|
int len = vsnprintf(buffer, 128, format, args);
|
|
if (len < 128) {
|
|
g_state.log_callback(level, buffer, g_state.log_callback_user_data);
|
|
} else {
|
|
char* buffer2 = new char[len+1];
|
|
vsnprintf(buffer2, len+1, format, args_copy);
|
|
buffer2[len] = 0;
|
|
g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
|
|
delete[] buffer2;
|
|
}
|
|
va_end(args_copy);
|
|
}
|
|
|
|
static void llama_log_internal(llama_log_level level, const char * format, ...) {
|
|
va_list args;
|
|
va_start(args, format);
|
|
llama_log_internal_v(level, format, args);
|
|
va_end(args);
|
|
}
|
|
|
|
static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data) {
|
|
(void) level;
|
|
(void) user_data;
|
|
fputs(text, stderr);
|
|
fflush(stderr);
|
|
}
|