mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 01:24:36 +00:00
Merge branch 'upstream' into concedo
# Conflicts:
#	README.md
#	llama.cpp
#	otherarch/sdcpp/SDCPP_LICENSE
#	scripts/sync-ggml-am.sh
#	scripts/sync-ggml.sh
parent 44c384d918
commit d1bb126605
33 changed files with 4005 additions and 2204 deletions
655
AUTHORS
Normal file
@@ -0,0 +1,655 @@
# date: Tue Apr 9 09:17:14 EEST 2024
# this file is auto-generated by scripts/gen-authors.sh

0cc4m <picard12@live.de>
|
||||||
|
0xspringtime <110655352+0xspringtime@users.noreply.github.com>
|
||||||
|
2f38b454 <dxf@protonmail.com>
|
||||||
|
3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com>
|
||||||
|
44670 <44670@users.noreply.github.com>
|
||||||
|
AN Long <aisk@users.noreply.github.com>
|
||||||
|
AT <manyoso@users.noreply.github.com>
|
||||||
|
Aarni Koskela <akx@iki.fi>
|
||||||
|
Aaron Miller <apage43@ninjawhale.com>
|
||||||
|
Aaryaman Vasishta <aaryaman.vasishta@amd.com>
|
||||||
|
Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
|
||||||
|
Abhishek Gopinath K <31348521+overtunned@users.noreply.github.com>
|
||||||
|
Adithya Balaji <adithya.b94@gmail.com>
|
||||||
|
AdithyanI <adithyan.i4internet@gmail.com>
|
||||||
|
Adrian <smith.adriane@gmail.com>
|
||||||
|
Adrian Hesketh <a-h@users.noreply.github.com>
|
||||||
|
AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
|
||||||
|
Aisuko <urakiny@gmail.com>
|
||||||
|
Alberto <57916483+albbus-stack@users.noreply.github.com>
|
||||||
|
Alex <awhill19@icloud.com>
|
||||||
|
Alex Azarov <alex@azarov.by>
|
||||||
|
Alex Azarov <alexander.azarov@mapbox.com>
|
||||||
|
Alex Klinkhamer <from.github.com.917@grencez.dev>
|
||||||
|
Alex Klinkhamer <git@grencez.dev>
|
||||||
|
Alex Nguyen <tiendung@users.noreply.github.com>
|
||||||
|
Alex Petenchea <alex.petenchea@gmail.com>
|
||||||
|
Alex Renda <alexrenda@users.noreply.github.com>
|
||||||
|
Alex von Gluck IV <kallisti5@unixzen.com>
|
||||||
|
Alexey Parfenov <zxed@alkatrazstudio.net>
|
||||||
|
Ali Chraghi <63465728+alichraghi@users.noreply.github.com>
|
||||||
|
Ali Nehzat <ali.nehzat@thanks.dev>
|
||||||
|
Ali Tariq <ali.tariq@10xengineers.ai>
|
||||||
|
Alon <alonfaraj@gmail.com>
|
||||||
|
AlpinDale <52078762+AlpinDale@users.noreply.github.com>
|
||||||
|
AmirAli Mirian <37371367+amiralimi@users.noreply.github.com>
|
||||||
|
Ananta Bastola <anantarajbastola@gmail.com>
|
||||||
|
Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
|
||||||
|
András Salamon <ott2@users.noreply.github.com>
|
||||||
|
Andrei <abetlen@gmail.com>
|
||||||
|
Andrew Canis <andrew.canis@gmail.com>
|
||||||
|
Andrew Duffy <a10y@users.noreply.github.com>
|
||||||
|
Andrew Godfrey <AndrewGodfrey@users.noreply.github.com>
|
||||||
|
Arik Poznanski <arikpoz@users.noreply.github.com>
|
||||||
|
Artem <guinmoon@gmail.com>
|
||||||
|
Artyom Lebedev <vagran.ast@gmail.com>
|
||||||
|
Asbjørn Olling <asbjornolling@gmail.com>
|
||||||
|
Ásgeir Bjarni Ingvarsson <asgeir@fundinn.org>
|
||||||
|
Ashok Gelal <401055+ashokgelal@users.noreply.github.com>
|
||||||
|
Ashraful Islam <ashraful.meche@gmail.com>
|
||||||
|
Atsushi Tatsuma <yoshoku@outlook.com>
|
||||||
|
Austin <77757836+teleprint-me@users.noreply.github.com>
|
||||||
|
AustinMroz <austinmroz@utexas.edu>
|
||||||
|
BADR <contact@pythops.com>
|
||||||
|
Bach Le <bach@bullno1.com>
|
||||||
|
Bailey Chittle <39804642+bachittle@users.noreply.github.com>
|
||||||
|
BarfingLemurs <128182951+BarfingLemurs@users.noreply.github.com>
|
||||||
|
Behnam M <58621210+ibehnam@users.noreply.github.com>
|
||||||
|
Ben Garney <bengarney@users.noreply.github.com>
|
||||||
|
Ben Siraphob <bensiraphob@gmail.com>
|
||||||
|
Ben Williams <ben@719ben.com>
|
||||||
|
Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com>
|
||||||
|
Bernat Vadell <hounter.caza@gmail.com>
|
||||||
|
Bodo Graumann <mail@bodograumann.de>
|
||||||
|
Bono Lv <lvscar@users.noreply.github.com>
|
||||||
|
Borislav Stanimirov <b.stanimirov@abv.bg>
|
||||||
|
Branden Butler <bwtbutler@hotmail.com>
|
||||||
|
Brian <mofosyne@gmail.com>
|
||||||
|
Bruce MacDonald <brucewmacdonald@gmail.com>
|
||||||
|
CJ Pais <cj@cjpais.com>
|
||||||
|
CRD716 <crd716@gmail.com>
|
||||||
|
Cameron <csteele@steelecameron.com>
|
||||||
|
Cameron Kaiser <classilla@users.noreply.github.com>
|
||||||
|
Casey Primozic <casey@cprimozic.net>
|
||||||
|
Casey Primozic <me@ameo.link>
|
||||||
|
CausalLM <148736309+CausalLM@users.noreply.github.com>
|
||||||
|
Cebtenzzre <cebtenzzre@gmail.com>
|
||||||
|
Chad Brewbaker <crb002@gmail.com>
|
||||||
|
Cheng Shao <terrorjack@type.dance>
|
||||||
|
Chris Kuehl <ckuehl@ckuehl.me>
|
||||||
|
Christian Demsar <christian@github.email.demsar.us>
|
||||||
|
Christian Demsar <crasm@git.vczf.us>
|
||||||
|
Christian Falch <875252+chrfalch@users.noreply.github.com>
|
||||||
|
Christian Kögler <ck3d@gmx.de>
|
||||||
|
Clark Saben <76020733+csaben@users.noreply.github.com>
|
||||||
|
Clint Herron <hanclinto@gmail.com>
|
||||||
|
Cuong Trinh Manh <nguoithichkhampha@gmail.com>
|
||||||
|
DAN™ <dranger003@gmail.com>
|
||||||
|
Damian Stewart <d@damianstewart.com>
|
||||||
|
Dane Madsen <dane_madsen@hotmail.com>
|
||||||
|
DaniAndTheWeb <57776841+DaniAndTheWeb@users.noreply.github.com>
|
||||||
|
Daniel Bevenius <daniel.bevenius@gmail.com>
|
||||||
|
Daniel Drake <drake@endlessos.org>
|
||||||
|
Daniel Hiltgen <dhiltgen@users.noreply.github.com>
|
||||||
|
Daniel Illescas Romero <illescas.daniel@protonmail.com>
|
||||||
|
DannyDaemonic <DannyDaemonic@gmail.com>
|
||||||
|
Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com>
|
||||||
|
Dave Della Costa <ddellacosta+github@gmail.com>
|
||||||
|
David Friehs <david@friehs.info>
|
||||||
|
David Kennedy <dakennedyd@gmail.com>
|
||||||
|
David Pflug <david@pflug.email>
|
||||||
|
David Renshaw <dwrenshaw@gmail.com>
|
||||||
|
David Sommers <12738+databyte@users.noreply.github.com>
|
||||||
|
David Yang <davidyang6us@gmail.com>
|
||||||
|
Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com>
|
||||||
|
Dean <Dean.Sinaean@gmail.com>
|
||||||
|
Deins <deinsegle@gmail.com>
|
||||||
|
Didzis Gosko <didzis@users.noreply.github.com>
|
||||||
|
Don Mahurin <dmahurin@users.noreply.github.com>
|
||||||
|
DooWoong Lee (David) <manics99@naver.com>
|
||||||
|
Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com>
|
||||||
|
Douglas Hanley <thesecretaryofwar@gmail.com>
|
||||||
|
Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com>
|
||||||
|
Ebey Abraham <ebey97@gmail.com>
|
||||||
|
Ed Lee <edilee@mozilla.com>
|
||||||
|
Ed Lepedus <ed.lepedus@googlemail.com>
|
||||||
|
Edward Taylor <edeetee@gmail.com>
|
||||||
|
Elbios <141279586+Elbios@users.noreply.github.com>
|
||||||
|
Engininja2 <139037756+Engininja2@users.noreply.github.com>
|
||||||
|
Equim <sayaka@ekyu.moe>
|
||||||
|
Eric Sommerlade <es0m@users.noreply.github.com>
|
||||||
|
Eric Zhang <34133756+EZForever@users.noreply.github.com>
|
||||||
|
Erik Garrison <erik.garrison@gmail.com>
|
||||||
|
Erik Scholz <Green-Sky@users.noreply.github.com>
|
||||||
|
Ettore Di Giacinto <mudler@users.noreply.github.com>
|
||||||
|
Evan Jones <evan.q.jones@gmail.com>
|
||||||
|
Evan Miller <emmiller@gmail.com>
|
||||||
|
Eve <139727413+netrunnereve@users.noreply.github.com>
|
||||||
|
Evgeny Kurnevsky <kurnevsky@gmail.com>
|
||||||
|
Ewout ter Hoeven <E.M.terHoeven@student.tudelft.nl>
|
||||||
|
ExtReMLapin <3909752+ExtReMLapin@users.noreply.github.com>
|
||||||
|
FK <sozforex@gmail.com>
|
||||||
|
Fabian <cmdrf@users.noreply.github.com>
|
||||||
|
Fabio R. Sluzala <Fabio3rs@users.noreply.github.com>
|
||||||
|
Faez Shakil <faez.shakil@gmail.com>
|
||||||
|
FantasyGmm <16450052+FantasyGmm@users.noreply.github.com>
|
||||||
|
Fattire <528174+fat-tire@users.noreply.github.com>
|
||||||
|
Felix <stenbackfelix@gmail.com>
|
||||||
|
Finn Voorhees <finnvoorhees@gmail.com>
|
||||||
|
Firat <firatkiral@gmail.com>
|
||||||
|
Folko-Ven <71110216+Folko-Ven@users.noreply.github.com>
|
||||||
|
Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com>
|
||||||
|
Francisco Melo <43780565+francis2tm@users.noreply.github.com>
|
||||||
|
FrankHB <frankhb1989@gmail.com>
|
||||||
|
Frederik Vogel <Schaltfehler@users.noreply.github.com>
|
||||||
|
Gabe Goodhart <gabe.l.hart@gmail.com>
|
||||||
|
GainLee <perfecter.gen@gmail.com>
|
||||||
|
Galunid <karolek1231456@gmail.com>
|
||||||
|
Gary Linscott <glinscott@gmail.com>
|
||||||
|
Gary Mulder <gjmulder@gmail.com>
|
||||||
|
Genkagaku.GPT <hlhr202@163.com>
|
||||||
|
Georgi Gerganov <ggerganov@gmail.com>
|
||||||
|
Gilad S <giladgd@users.noreply.github.com>
|
||||||
|
GiviMAD <GiviMAD@users.noreply.github.com>
|
||||||
|
Govlzkoy <gotope@users.noreply.github.com>
|
||||||
|
Guillaume "Vermeille" Sanchez <Guillaume.V.Sanchez@gmail.com>
|
||||||
|
Guillaume Wenzek <gwenzek@users.noreply.github.com>
|
||||||
|
Guoteng <32697156+SolenoidWGT@users.noreply.github.com>
|
||||||
|
Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
|
||||||
|
Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com>
|
||||||
|
Haohui Mai <ricetons@gmail.com>
|
||||||
|
Haoxiang Fei <tonyfettes@tonyfettes.com>
|
||||||
|
Harald Fernengel <harald.fernengel@here.com>
|
||||||
|
Hatsune Miku <129688334+at8u@users.noreply.github.com>
|
||||||
|
Henk Poley <HenkPoley@gmail.com>
|
||||||
|
Henri Vasserman <henv@hot.ee>
|
||||||
|
Henrik Forstén <henrik.forsten@gmail.com>
|
||||||
|
Herman Semenov <GermanAizek@yandex.ru>
|
||||||
|
Hesen Peng <hesen.peng@gmail.com>
|
||||||
|
Hoang Nguyen <hugo53@users.noreply.github.com>
|
||||||
|
Hongyu Ouyang <96765450+casavaca@users.noreply.github.com>
|
||||||
|
Howard Su <howard0su@gmail.com>
|
||||||
|
Hua Jiang <allenhjiang@outlook.com>
|
||||||
|
Huawei Lin <huaweilin.cs@gmail.com>
|
||||||
|
Ian Bull <irbull@eclipsesource.com>
|
||||||
|
Ian Bull <irbull@gmail.com>
|
||||||
|
Ian Scrivener <github@zilogy.asia>
|
||||||
|
Ido S <ido.pluto@gmail.com>
|
||||||
|
IgnacioFDM <ignaciofdm@gmail.com>
|
||||||
|
Igor Okulist <okigan@gmail.com>
|
||||||
|
Ikko Eltociear Ashimine <eltociear@gmail.com>
|
||||||
|
Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
|
||||||
|
Ionoclast Laboratories <brigham@ionoclast.com>
|
||||||
|
Isaac McFadyen <isaac@imcf.me>
|
||||||
|
IsaacDynamo <61521674+IsaacDynamo@users.noreply.github.com>
|
||||||
|
Ivan Komarov <Ivan.Komarov@dfyz.info>
|
||||||
|
Ivan Stepanov <ivanstepanovftw@gmail.com>
|
||||||
|
JH23X <165871467+JH23X@users.noreply.github.com>
|
||||||
|
Jack Mousseau <jmousseau@users.noreply.github.com>
|
||||||
|
JackJollimore <130917767+JackJollimore@users.noreply.github.com>
|
||||||
|
Jag Chadha <jagtesh@gmail.com>
|
||||||
|
Jakub N <jakubniemczyk97@gmail.com>
|
||||||
|
James Reynolds <magnusviri@users.noreply.github.com>
|
||||||
|
Jan Boon <jan.boon@kaetemi.be>
|
||||||
|
Jan Boon <kaetemi@gmail.com>
|
||||||
|
Jan Ploski <jpl@plosquare.com>
|
||||||
|
Jannis Schönleber <joennlae@gmail.com>
|
||||||
|
Jared Van Bortel <cebtenzzre@gmail.com>
|
||||||
|
Jared Van Bortel <jared@nomic.ai>
|
||||||
|
Jason McCartney <jmac@theroot.org>
|
||||||
|
Jean-Christophe Hoelt <hoelt@fovea.cc>
|
||||||
|
Jean-Michaël Celerier <jeanmichael.celerier+github@gmail.com>
|
||||||
|
Jed Fox <git@jedfox.com>
|
||||||
|
Jeffrey Quesnelle <emozilla@nousresearch.com>
|
||||||
|
Jesse Jojo Johnson <williamsaintgeorge@gmail.com>
|
||||||
|
Jhen-Jie Hong <iainst0409@gmail.com>
|
||||||
|
Jiahao Li <liplus17@163.com>
|
||||||
|
Jian Liao <jianliao@users.noreply.github.com>
|
||||||
|
JidongZhang-THU <1119708529@qq.com>
|
||||||
|
Jinwoo Jeong <33892306+williamjeong2@users.noreply.github.com>
|
||||||
|
Jiří Podivín <66251151+jpodivin@users.noreply.github.com>
|
||||||
|
Johannes Gäßler <johannesg@5d6.de>
|
||||||
|
Johannes Rudolph <johannes.rudolph@gmail.com>
|
||||||
|
John <78893154+cmp-nct@users.noreply.github.com>
|
||||||
|
John Balis <phobossystems@gmail.com>
|
||||||
|
John Smith <67539080+kingsidelee@users.noreply.github.com>
|
||||||
|
JohnnyB <jboero@users.noreply.github.com>
|
||||||
|
Jonas Wunderlich <32615971+jonas-w@users.noreply.github.com>
|
||||||
|
Jorge A <161275481+jorgealias@users.noreply.github.com>
|
||||||
|
Jose Maldonado <63384398+yukiteruamano@users.noreply.github.com>
|
||||||
|
Joseph Stahl <1269177+josephst@users.noreply.github.com>
|
||||||
|
Joyce <joycebrum@google.com>
|
||||||
|
Juan Calderon-Perez <835733+gaby@users.noreply.github.com>
|
||||||
|
Judd <foldl@users.noreply.github.com>
|
||||||
|
Julius Arkenberg <arki05@users.noreply.github.com>
|
||||||
|
Jun Jie <71215065+junnjiee16@users.noreply.github.com>
|
||||||
|
Juraj Bednar <juraj@bednar.io>
|
||||||
|
Justin Parker <jparkerweb@gmail.com>
|
||||||
|
Justin Suess <justin.suess@westpoint.edu>
|
||||||
|
Justine Tunney <jtunney@gmail.com>
|
||||||
|
Juuso Alasuutari <juuso.alasuutari@gmail.com>
|
||||||
|
KASR <karim.asrih@gmail.com>
|
||||||
|
Kamil Tomšík <info@tomsik.cz>
|
||||||
|
Karsten Weiss <knweiss@gmail.com>
|
||||||
|
Karthick <j.karthic2004@gmail.com>
|
||||||
|
Karthik Kumar Viswanathan <195178+guilt@users.noreply.github.com>
|
||||||
|
Karthik Sethuraman <k.seth1993@gmail.com>
|
||||||
|
Kasumi <90275229+kasumi-1@users.noreply.github.com>
|
||||||
|
Kawrakow <48489457+ikawrakow@users.noreply.github.com>
|
||||||
|
Keiichi Tabata <keiichi.tabata@outlook.com>
|
||||||
|
Kenvix ⭐ <kenvixzure@live.com>
|
||||||
|
Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
|
||||||
|
Kevin Ji <1146876+kevinji@users.noreply.github.com>
|
||||||
|
Kevin Kwok <antimatter15@gmail.com>
|
||||||
|
Kevin Lo <kevlo@kevlo.org>
|
||||||
|
Kolen Cheung <ickc@users.noreply.github.com>
|
||||||
|
Konstantin Herud <konstantin.herud@denkbares.com>
|
||||||
|
Konstantin Zhuravlyov <konstantin.zhuravlyov@amd.com>
|
||||||
|
Kunshang Ji <kunshang.ji@intel.com>
|
||||||
|
Kyle Liang <liangmanlai@gmail.com>
|
||||||
|
Kyle Mistele <kyle@mistele.com>
|
||||||
|
Kylin <56434533+KyL0N@users.noreply.github.com>
|
||||||
|
Lars Grammel <lars.grammel@gmail.com>
|
||||||
|
Laura <Tijntje_7@msn.com>
|
||||||
|
Lee <44310445+lx200916@users.noreply.github.com>
|
||||||
|
Lee Drake <b.lee.drake@gmail.com>
|
||||||
|
Leng Yue <lengyue@lengyue.me>
|
||||||
|
LeonEricsson <70749762+LeonEricsson@users.noreply.github.com>
|
||||||
|
Leonardo Neumann <leonardo@neumann.dev.br>
|
||||||
|
Li Tan <tanliboy@gmail.com>
|
||||||
|
Linwei Wang <wanix1988@gmail.com>
|
||||||
|
LoganDark <github@logandark.mozmail.com>
|
||||||
|
LostRuins <39025047+LostRuins@users.noreply.github.com>
|
||||||
|
Luciano <lucianostrika44@gmail.com>
|
||||||
|
Luo Tian <lt@basecity.com>
|
||||||
|
M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
|
||||||
|
Maarten ter Huurne <maarten@treewalker.org>
|
||||||
|
Mack Straight <eiz@users.noreply.github.com>
|
||||||
|
Maël Kerbiriou <m431.kerbiriou@gmail.com>
|
||||||
|
MaggotHATE <clay1326@gmail.com>
|
||||||
|
Marc Köhlbrugge <subscriptions@marckohlbrugge.com>
|
||||||
|
Marco Matthies <71844+marcom@users.noreply.github.com>
|
||||||
|
Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com>
|
||||||
|
Marian Cepok <marian.cepok@gmail.com>
|
||||||
|
Mark Fairbairn <thebaron88@gmail.com>
|
||||||
|
Marko Tasic <mtasic85@gmail.com>
|
||||||
|
Martin Krasser <krasserm@googlemail.com>
|
||||||
|
Martin Schwaighofer <mschwaig@users.noreply.github.com>
|
||||||
|
Marvin Gießing <marvin.giessing@gmail.com>
|
||||||
|
Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
|
||||||
|
Matheus C. França <matheus-catarino@hotmail.com>
|
||||||
|
Matheus Gabriel Alves Silva <matheusgasource@gmail.com>
|
||||||
|
Mathieu Nayrolles <MathieuNls@users.noreply.github.com>
|
||||||
|
Mathijs de Bruin <mathijs@mathijsfietst.nl>
|
||||||
|
Matt Clayton <156335168+mattjcly@users.noreply.github.com>
|
||||||
|
Matt Pulver <matt.pulver@heavy.ai>
|
||||||
|
Matteo Boschini <12133566+mbosc@users.noreply.github.com>
|
||||||
|
Matthew Tejo <matthew.tejo@gmail.com>
|
||||||
|
Matvey Soloviev <blackhole89@gmail.com>
|
||||||
|
Maxime <672982+maximegmd@users.noreply.github.com>
|
||||||
|
Maximilian Winter <maximilian.winter.91@gmail.com>
|
||||||
|
Meng Zhang <meng@tabbyml.com>
|
||||||
|
Meng, Hengyu <hengyu.meng@intel.com>
|
||||||
|
Merrick Christensen <merrick.christensen@gmail.com>
|
||||||
|
Michael Coppola <m18coppola@gmail.com>
|
||||||
|
Michael Hueschen <m@mhueschen.dev>
|
||||||
|
Michael Kesper <mkesper@schokokeks.org>
|
||||||
|
Michael Klimenko <mklimenko29@gmail.com>
|
||||||
|
Michael Podvitskiy <podvitskiymichael@gmail.com>
|
||||||
|
Michael Potter <NanoTekGuy@Gmail.com>
|
||||||
|
Michaël de Vries <vriesdemichael@gmail.com>
|
||||||
|
Mihai <mihai.chirculescu@yahoo.com>
|
||||||
|
Mike <ytianhui2004@gmail.com>
|
||||||
|
Minsoo Cheong <54794500+mscheong01@users.noreply.github.com>
|
||||||
|
Mirko185 <mirkosig@gmail.com>
|
||||||
|
Mirror Azure <54669636+MirrorAzure@users.noreply.github.com>
|
||||||
|
Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
|
||||||
|
Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
|
||||||
|
Murilo Santana <mvrilo@gmail.com>
|
||||||
|
Musab Gultekin <musabgultekin@users.noreply.github.com>
|
||||||
|
Nam D. Tran <42194884+namtranase@users.noreply.github.com>
|
||||||
|
NawafAlansari <72708095+NawafAlansari@users.noreply.github.com>
|
||||||
|
Nebula <infinitewormhole@gmail.com>
|
||||||
|
Neo Zhang Jianyu <jianyu.zhang@intel.com>
|
||||||
|
Neuman Vong <neuman.vong@gmail.com>
|
||||||
|
Nexesenex <124105151+Nexesenex@users.noreply.github.com>
|
||||||
|
Niall Coates <1349685+Niall-@users.noreply.github.com>
|
||||||
|
Nicolai Weitkemper <kontakt@nicolaiweitkemper.de>
|
||||||
|
Nigel Bosch <pnigelb@gmail.com>
|
||||||
|
Niklas Korz <niklas@niklaskorz.de>
|
||||||
|
Nindaleth <Nindaleth@users.noreply.github.com>
|
||||||
|
Oleksandr Nikitin <oleksandr@tvori.info>
|
||||||
|
Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
|
||||||
|
Olivier Chafik <ochafik@users.noreply.github.com>
|
||||||
|
Ondřej Čertík <ondrej@certik.us>
|
||||||
|
Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
|
||||||
|
Paul Tsochantaris <ptsochantaris@icloud.com>
|
||||||
|
Pavol Rusnak <pavol@rusnak.io>
|
||||||
|
Pedro Cuenca <pedro@huggingface.co>
|
||||||
|
Peter Sugihara <peter@campsh.com>
|
||||||
|
Phil H <5756783+phiharri@users.noreply.github.com>
|
||||||
|
Philip Taron <philip.taron@gmail.com>
|
||||||
|
Phillip Kravtsov <phillip@kravtsov.net>
|
||||||
|
Pierre Alexandre SCHEMBRI <pa.schembri@gmail.com>
|
||||||
|
Pierrick Hymbert <pierrick.hymbert@gmail.com>
|
||||||
|
Przemysław Pawełczyk <przemoc@gmail.com>
|
||||||
|
Qin Yue Chen <71813199+chenqiny@users.noreply.github.com>
|
||||||
|
Qingyou Meng <meng.qingyou@gmail.com>
|
||||||
|
Qu Zongfu <43257352+yancaoweidaode@users.noreply.github.com>
|
||||||
|
RJ Adriaansen <adriaansen@eshcc.eur.nl>
|
||||||
|
Radoslav Gerganov <rgerganov@gmail.com>
|
||||||
|
Radosław Gryta <radek.gryta@gmail.com>
|
||||||
|
Rahul Vivek Nair <68507071+RahulVivekNair@users.noreply.github.com>
|
||||||
|
Rand Xie <randxiexyy29@gmail.com>
|
||||||
|
Randall Fitzgerald <randall@dasaku.net>
|
||||||
|
Reinforce-II <fate@eastal.com>
|
||||||
|
Riceball LEE <snowyu.lee@gmail.com>
|
||||||
|
Richard Kiss <him@richardkiss.com>
|
||||||
|
Richard Roberson <richardr1126@gmail.com>
|
||||||
|
Rick G <26732651+TheFlipbook@users.noreply.github.com>
|
||||||
|
Rickard Edén <rickardeden@gmail.com>
|
||||||
|
Rickard Hallerbäck <rickard.hallerback@gmail.com>
|
||||||
|
Rickey Bowers Jr <bitRAKE@gmail.com>
|
||||||
|
Riley Stewart <ristew@users.noreply.github.com>
|
||||||
|
Rinne <AsakusaRinne@gmail.com>
|
||||||
|
Rinne <liu_yaohui1998@126.com>
|
||||||
|
Robert Brisita <986796+rbrisita@users.noreply.github.com>
|
||||||
|
Robert Sung-wook Shin <edp1096@users.noreply.github.com>
|
||||||
|
Robey Holderith <robey@flaminglunchbox.net>
|
||||||
|
Robyn <robyngraf@users.noreply.github.com>
|
||||||
|
Roger Meier <r.meier@siemens.com>
|
||||||
|
Roland <14355895+rbur0425@users.noreply.github.com>
|
||||||
|
Romain D <90720+Artefact2@users.noreply.github.com>
|
||||||
|
Romain Neutron <romain@neutron.io>
|
||||||
|
Roman Parykin <donderom@gmail.com>
|
||||||
|
Ron Evans <ron@hybridgroup.com>
|
||||||
|
Ron Jailall <rojailal@gmail.com>
|
||||||
|
Ronny Brendel <ronnybrendel@gmail.com>
|
||||||
|
Ronsor <ronsor@ronsor.pw>
|
||||||
|
Rowan Hart <rowanbhart@gmail.com>
|
||||||
|
Rune <43761327+Rune-AI@users.noreply.github.com>
|
||||||
|
Ryan Landay <rlanday@gmail.com>
|
||||||
|
Ryder Wishart <ryderwishart@gmail.com>
|
||||||
|
Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
|
||||||
|
SakuraUmi <yukinon244@gmail.com>
|
||||||
|
Salvador E. Tropea <stropea@inti.gob.ar>
|
||||||
|
Sam Spilsbury <smspillaz@gmail.com>
|
||||||
|
Sami Farin <3876865+Safari77@users.noreply.github.com>
|
||||||
|
Samuel Maynard <samwmaynard@gmail.com>
|
||||||
|
Sang-Kil Park <sang.park@42dot.ai>
|
||||||
|
Seb C <47074056+Sebby37@users.noreply.github.com>
|
||||||
|
Sebastián A <sebastian.aedo29@gmail.com>
|
||||||
|
SebastianApel <13675545+SebastianApel@users.noreply.github.com>
|
||||||
|
Senemu <10880819+Senemu@users.noreply.github.com>
|
||||||
|
Sergey Alirzaev <zl29ah@gmail.com>
|
||||||
|
Sergio López <slp@sinrega.org>
|
||||||
|
SeungWon Jeong <65549245+redlion0929@users.noreply.github.com>
|
||||||
|
ShadovvBeast <ShadovvBeast@gmail.com>
|
||||||
|
Shakhar Dasgupta <shakhardasgupta@gmail.com>
|
||||||
|
Shangning Xu <32517059+xushangning@users.noreply.github.com>
|
||||||
|
Shijie <821898965@qq.com>
|
||||||
|
Shintarou Okada <kokuzen@gmail.com>
|
||||||
|
Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
|
||||||
|
Shouzheng Liu <lshzh.hi@gmail.com>
|
||||||
|
Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
|
||||||
|
Simon Willison <swillison@gmail.com>
|
||||||
|
Siwen Yu <yusiwen@gmail.com>
|
||||||
|
Sky Yan <skyan83@gmail.com>
|
||||||
|
Slaren <2141330+slaren@users.noreply.github.com>
|
||||||
|
Slava Primenko <primenko.s@gmail.com>
|
||||||
|
SoftwareRenderer <138734813+SoftwareRenderer@users.noreply.github.com>
|
||||||
|
Someone <sergei.kozlukov@aalto.fi>
|
||||||
|
Someone Serge <sergei.kozlukov@aalto.fi>
|
||||||
|
Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
|
||||||
|
Spencer Sutton <spencersutton@users.noreply.github.com>
|
||||||
|
Srinivas Billa <nivibilla@gmail.com>
|
||||||
|
Stefan Sydow <stefan@sydow.email>
|
||||||
|
Stephan Walter <stephan@walter.name>
|
||||||
|
Stephen Nichols <snichols@users.noreply.github.com>
|
||||||
|
Steve Grubb <ausearch.1@gmail.com>
|
||||||
|
Steven Roussey <sroussey@gmail.com>
|
||||||
|
Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
|
||||||
|
Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com>
|
||||||
|
SuperUserNameMan <yoann@terminajones.com>
|
||||||
|
Tai Duc Nguyen <taiducnguyen.drexel@gmail.com>
|
||||||
|
Taikono-Himazin <kazu@po.harenet.ne.jp>
|
||||||
|
Tameem <113388789+AhmadTameem@users.noreply.github.com>
|
||||||
|
Tamotsu Takahashi <ttakah+github@gmail.com>
|
||||||
|
Thái Hoàng Tâm <75922889+RoyalHeart@users.noreply.github.com>
|
||||||
|
Thatcher Chamberlin <j.thatcher.c@gmail.com>
|
||||||
|
Theia Vogel <theia@vgel.me>
|
||||||
|
Thérence <13496987+Royalphax@users.noreply.github.com>
|
||||||
|
Thibault Terrasson <thibault.terrasson@gmail.com>
|
||||||
|
Thomas Klausner <wiz@gatalith.at>
|
||||||
|
Tim Miller <drasticactions@users.noreply.github.com>
|
||||||
|
Timmy Knight <r2d2fish@gmail.com>
|
||||||
|
Timothy Cronin <40186632+4imothy@users.noreply.github.com>
|
||||||
|
Ting Lou <ting.lou@gmail.com>
|
||||||
|
Ting Sun <suntcrick@gmail.com>
|
||||||
|
Tobias Lütke <tobi@shopify.com>
|
||||||
|
Tom C <tom.corelis@gmail.com>
|
||||||
|
Tom Jobbins <784313+TheBloke@users.noreply.github.com>
|
||||||
|
Tomas <tom.tomas.36478119@gmail.com>
|
||||||
|
Tomáš Pazdiora <tomas.pazdiora@gmail.com>
|
||||||
|
Tristan Ross <rosscomputerguy@protonmail.com>
|
||||||
|
Tungsten842 <886724vf@anonaddy.me>
|
||||||
|
Tungsten842 <quantmint@protonmail.com>
|
||||||
|
Tushar <ditsuke@protonmail.com>
|
||||||
|
UEXTM.com <84163508+uextm@users.noreply.github.com>
|
||||||
|
Uzo Nweke <uzoechi@gmail.com>
|
||||||
|
Vaibhav Srivastav <vaibhavs10@gmail.com>
|
||||||
|
Val Kharitonov <mail@kharvd.com>
|
||||||
|
Valentin Konovalov <valle.ketsujin@gmail.com>
|
||||||
|
Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com>
|
||||||
|
Victor Z. Peng <ziliangdotme@gmail.com>
|
||||||
|
Vlad <spitfireage@gmail.com>
|
||||||
|
Vladimir <bogdad@gmail.com>
|
||||||
|
Vladimir Malyutin <first-leon@yandex.ru>
|
||||||
|
Vladimir Zorin <vladimir@deviant.guru>
|
||||||
|
Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com>
|
||||||
|
WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com>
|
||||||
|
Weird Constructor <weirdconstructor@gmail.com>
|
||||||
|
Welby Seely <welbyseely@gmail.com>
|
||||||
|
Wentai Zhang <rchardx@gmail.com>
|
||||||
|
WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com>
|
||||||
|
Willy Tarreau <w@1wt.eu>
|
||||||
|
Wu Jian Ping <wujjpp@hotmail.com>
|
||||||
|
Wu Jian Ping <wujp@greatld.com>
|
||||||
|
Xiake Sun <xiake.sun@intel.com>
|
||||||
|
Xiang (Kevin) Li <kevinli020508@gmail.com>
|
||||||
|
Xiao-Yong Jin <jinxiaoyong@gmail.com>
|
||||||
|
XiaotaoChen <chenxiaotao1234@gmail.com>
|
||||||
|
Xiaoyi Chen <cxychina@gmail.com>
|
||||||
|
Xingchen Song(宋星辰) <xingchensong1996@163.com>
|
||||||
|
Xuan Son Nguyen <thichthat@gmail.com>
|
||||||
|
Yann Follet <131855179+YannFollet@users.noreply.github.com>
|
||||||
|
Yiming Cui <conandiy@vip.qq.com>
|
||||||
|
Yishuo Wang <MeouSker77@outlook.com>
|
||||||
|
Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
|
||||||
|
Yui <dev@sleepyyui.com>
|
||||||
|
Yusuf Kağan Hanoğlu <hanoglu@yahoo.com>
|
||||||
|
Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com>
|
||||||
|
ZHAOKAI WANG <sanxianwei@163.com>
|
||||||
|
Zane Shannon <z@zcs.me>
|
||||||
|
Zay <95888118+isaiahbjork@users.noreply.github.com>
|
||||||
|
Zenix <zenixls2@gmail.com>
|
||||||
|
Zhang Peiyuan <a1286225768@gmail.com>
|
||||||
|
ZhouYuChen <zhouyuchen@naver.com>
|
||||||
|
Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
|
||||||
|
Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
|
||||||
|
Zsapi <martin1.zsapka@gmail.com>
|
||||||
|
a-n-n-a-l-e-e <150648636+a-n-n-a-l-e-e@users.noreply.github.com>
|
||||||
|
adel boussaken <netdur@gmail.com>
|
||||||
|
afrideva <95653597+afrideva@users.noreply.github.com>
|
||||||
|
akawrykow <142945436+akawrykow@users.noreply.github.com>
|
||||||
|
alexpinel <93524949+alexpinel@users.noreply.github.com>
|
||||||
|
alonfaraj <alonfaraj@gmail.com>
|
||||||
|
andrijdavid <david@geek.mg>
|
||||||
|
anon998 <131767832+anon998@users.noreply.github.com>
|
||||||
|
anzz1 <anzz1@live.com>
|
||||||
|
apaz <aarpazdera@gmail.com>
|
||||||
|
apcameron <37645737+apcameron@users.noreply.github.com>
|
||||||
|
arcrank <arcrank@gmail.com>
|
||||||
|
arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
|
||||||
|
at8u <129688334+at8u@users.noreply.github.com>
|
||||||
|
automaticcat <daogiatuank54@gmail.com>
|
||||||
|
bandoti <141645996+bandoti@users.noreply.github.com>
|
||||||
|
beiller <beiller@gmail.com>
|
||||||
|
bhubbb <79117352+bhubbb@users.noreply.github.com>
|
||||||
|
bmwl <brian.marshall@tolko.com>
|
||||||
|
bobqianic <129547291+bobqianic@users.noreply.github.com>
|
||||||
|
bryanSwk <93190252+bryanSwk@users.noreply.github.com>
|
||||||
|
bsilvereagle <bsilvereagle@users.noreply.github.com>
|
||||||
|
bssrdf <merlintiger@hotmail.com>
|
||||||
|
byte-6174 <88070277+byte-6174@users.noreply.github.com>
|
||||||
|
cebtenzzre <cebtenzzre@gmail.com>
|
||||||
|
chaihahaha <chai836275709@gmail.com>
|
||||||
|
chiranko <96988916+chiranko@users.noreply.github.com>
|
||||||
|
clibdev <52199778+clibdev@users.noreply.github.com>
|
||||||
|
clyang <clyang@clyang.net>
|
||||||
|
cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com>
|
||||||
|
coezbek <c.oezbek@gmail.com>
|
||||||
|
comex <comexk@gmail.com>
|
||||||
|
compilade <113953597+compilade@users.noreply.github.com>
|
||||||
|
crasm <crasm@git.vczf.net>
|
||||||
|
crasm <crasm@git.vczf.us>
|
||||||
|
daboe01 <daboe01@googlemail.com>
|
||||||
|
david raistrick <keen99@users.noreply.github.com>
|
||||||
|
ddpasa <112642920+ddpasa@users.noreply.github.com>
|
||||||
|
deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
|
||||||
|
divinity76 <divinity76@gmail.com>
|
||||||
|
dotpy314 <33351922+dotpy314@users.noreply.github.com>
|
||||||
|
drbh <david.richard.holtz@gmail.com>
|
||||||
|
ds5t5 <145942675+ds5t5@users.noreply.github.com>
|
||||||
|
dylan <canardleteer@users.noreply.github.com>
|
||||||
|
eastriver <lee@eastriver.dev>
|
||||||
|
ebraminio <ebraminio@gmail.com>
|
||||||
|
eiery <19350831+eiery@users.noreply.github.com>
|
||||||
|
eric8607242 <e0928021388@gmail.com>
|
||||||
|
fraxy-v <65565042+fraxy-v@users.noreply.github.com>
|
||||||
|
github-actions[bot] <github-actions[bot]@users.noreply.github.com>
|
||||||
|
gliptic <gliptic@users.noreply.github.com>
|
||||||
|
goerch <jhr.walter@t-online.de>
|
||||||
|
grahameth <96447521+grahameth@users.noreply.github.com>
|
||||||
|
gwjr <502526+gwjr@users.noreply.github.com>
|
||||||
|
h-h-h-h <13482553+h-h-h-h@users.noreply.github.com>
|
||||||
|
hankcs <cnhankmc@gmail.com>
|
||||||
|
hoangmit <hoangmit@users.noreply.github.com>
|
||||||
|
hongbo.mo <352280764@qq.com>
|
||||||
|
howlger <eclipse@voormann.de>
|
||||||
|
howlger <github@voormann.de>
|
||||||
|
hutli <6594598+hutli@users.noreply.github.com>
|
||||||
|
hutli <hutli@hutli.hu>
|
||||||
|
hutli <jensstaermose@hotmail.com>
|
||||||
|
hxer7963 <hxer7963@gmail.com>
|
||||||
|
hydai <z54981220@gmail.com>
|
||||||
|
iSma <ismail.senhaji@gmail.com>
|
||||||
|
iacore <74560659+iacore@users.noreply.github.com>
|
||||||
|
igarnier <igarnier@protonmail.com>
|
||||||
|
iohub <rickyang.pro@gmail.com>
|
||||||
|
jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
|
||||||
|
jameswu2014 <545426914@qq.com>
|
||||||
|
jneem <joeneeman@gmail.com>
|
||||||
|
johnson442 <56517414+johnson442@users.noreply.github.com>
|
||||||
|
jon-chuang <9093549+jon-chuang@users.noreply.github.com>
|
||||||
|
jp-x-g <jpxg-dev@protonmail.com>
|
||||||
|
jwj7140 <32943891+jwj7140@users.noreply.github.com>
|
||||||
|
kaizau <kaizau@users.noreply.github.com>
|
||||||
|
kalomaze <66376113+kalomaze@users.noreply.github.com>
|
||||||
|
kang <tpdns9032100@gmail.com>
|
||||||
|
katsu560 <118887472+katsu560@users.noreply.github.com>
|
||||||
|
kchro3 <62481661+kchro3@users.noreply.github.com>
|
||||||
|
khimaros <me@khimaros.com>
|
||||||
|
kiltyj <kiltyj@gmail.com>
|
||||||
|
klosax <131523366+klosax@users.noreply.github.com>
|
||||||
|
kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
|
||||||
|
kunnis <kunnis@users.noreply.github.com>
|
||||||
|
kuronekosaiko <EvanChanJ@163.com>
|
||||||
|
kuvaus <22169537+kuvaus@users.noreply.github.com>
|
||||||
|
kwin1412 <42286931+kwin1412@users.noreply.github.com>
|
||||||
|
l3utterfly <gc.pthzfoldr@gmail.com>
|
||||||
|
ldwang <ftgreat@163.com>
|
||||||
|
le.chang <cljs118@126.com>
|
||||||
|
leejet <leejet714@gmail.com>
|
||||||
|
limitedAtonement <limitedAtonement@users.noreply.github.com>
|
||||||
|
lon <114724657+longregen@users.noreply.github.com>
|
||||||
|
m3ndax <adrian.goessl@outlook.com>
|
||||||
|
maddes8cht <55592906+maddes8cht@users.noreply.github.com>
|
||||||
|
makomk <makosoft@googlemail.com>
|
||||||
|
manikbhandari <mbbhandarimanik2@gmail.com>
|
||||||
|
mdrokz <mohammadmunshi@gmail.com>
|
||||||
|
mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
|
||||||
|
minarchist <minarchist@users.noreply.github.com>
|
||||||
|
mj-shifu <77107165+mj-shifu@users.noreply.github.com>
|
||||||
|
mmyjona <jonathan.gonse@gmail.com>
|
||||||
|
momonga <115213907+mmnga@users.noreply.github.com>
|
||||||
|
moritzbrantner <31051084+moritzbrantner@users.noreply.github.com>
|
||||||
|
mzcu <milos.cubrilo@gmail.com>
|
||||||
|
nanahi <130121847+na-na-hi@users.noreply.github.com>
|
||||||
|
ngc92 <7938269+ngc92@users.noreply.github.com>
|
||||||
|
nhamanasu <45545786+nhamanasu@users.noreply.github.com>
|
||||||
|
niansa/tuxifan <anton-sa@web.de>
|
||||||
|
niansa/tuxifan <tuxifan@posteo.de>
|
||||||
|
ningshanwutuobang <ningshanwutuobang@gmail.com>
|
||||||
|
nold <Nold360@users.noreply.github.com>
|
||||||
|
nopperl <54780682+nopperl@users.noreply.github.com>
|
||||||
|
nusu-github <29514220+nusu-github@users.noreply.github.com>
|
||||||
|
olexiyb <olexiyb@gmail.com>
|
||||||
|
oobabooga <112222186+oobabooga@users.noreply.github.com>
|
||||||
|
opparco <parco.opaai@gmail.com>
|
||||||
|
ostix360 <55257054+ostix360@users.noreply.github.com>
|
||||||
|
perserk <perserk@gmail.com>
|
||||||
|
postmasters <namnguyen@google.com>
|
||||||
|
pudepiedj <pudepiedj@gmail.com>
|
||||||
|
qingfengfenga <41416092+qingfengfenga@users.noreply.github.com>
|
||||||
|
qouoq <qouoq@fastmail.com>
|
||||||
|
qunash <anzoria@gmail.com>
|
||||||
|
rabidcopy <rabidcopy@yahoo.com>
|
||||||
|
rankaiyx <rankaiyx@rankaiyx.com>
|
||||||
|
rhjdvsgsgks <26178113+rhjdvsgsgks@users.noreply.github.com>
|
||||||
|
rhuddleston <ryan.huddleston@percona.com>
|
||||||
|
rimoliga <53384203+rimoliga@users.noreply.github.com>
|
||||||
|
runfuture <runfuture@users.noreply.github.com>
|
||||||
|
sandyiscool <sandyiscool@gmail.com>
|
||||||
|
semidark <me@semidark.net>
|
||||||
|
sharpHL <132747147+sharpHL@users.noreply.github.com>
|
||||||
|
shibe2 <shibe@tuta.io>
|
||||||
|
singularity <12184989+singularity-s0@users.noreply.github.com>
|
||||||
|
sjinzh <sjinzh@gmail.com>
|
||||||
|
slaren <2141330+slaren@users.noreply.github.com>
|
||||||
|
slaren <slarengh@gmail.com>
|
||||||
|
snadampal <87143774+snadampal@users.noreply.github.com>
|
||||||
|
staviq <staviq@gmail.com>
|
||||||
|
stduhpf <stephduh@live.fr>
|
||||||
|
swittk <switt1995@gmail.com>
|
||||||
|
takov751 <40316768+takov751@users.noreply.github.com>
|
||||||
|
tarcey <cey.tarik@gmail.com>
|
||||||
|
texmex76 <40733439+texmex76@users.noreply.github.com>
|
||||||
|
thement <40525767+thement@users.noreply.github.com>
|
||||||
|
tjohnman <tjohnman@users.noreply.github.com>
|
||||||
|
tslmy <tslmy@users.noreply.github.com>
|
||||||
|
ubik2 <ubik2@users.noreply.github.com>
|
||||||
|
uint256_t <konndennsa@gmail.com>
|
||||||
|
uint256_t <maekawatoshiki1017@gmail.com>
|
||||||
|
unbounded <haakon@likedan.net>
|
||||||
|
valiray <133289098+valiray@users.noreply.github.com>
|
||||||
|
vodkaslime <646329483@qq.com>
|
||||||
|
vvhg1 <94630311+vvhg1@users.noreply.github.com>
|
||||||
|
vxiiduu <73044267+vxiiduu@users.noreply.github.com>
|
||||||
|
wbpxre150 <100937007+wbpxre150@users.noreply.github.com>
|
||||||
|
whoreson <139810751+whoreson@users.noreply.github.com>
|
||||||
|
wonjun Jang <strutive07@gmail.com>
|
||||||
|
wzy <32936898+Freed-Wu@users.noreply.github.com>
|
||||||
|
xaedes <xaedes@gmail.com>
|
||||||
|
xaedes <xaedes@googlemail.com>
|
||||||
|
xloem <0xloem@gmail.com>
|
||||||
|
yangli2 <yangli2@gmail.com>
|
||||||
|
yuiseki <yuiseki@gmail.com>
|
||||||
|
zakkor <edward.partenie@gmail.com>
|
||||||
|
zhouwg <6889919+zhouwg@users.noreply.github.com>
|
||||||
|
zrm <trustiosity.zrm@gmail.com>
|
||||||
|
源文雨 <41315874+fumiama@users.noreply.github.com>
|
||||||
|
Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>
@@ -17,6 +17,7 @@
 #include <unordered_set>
 #include <vector>
 #include <cinttypes>
+#include <codecvt>
 
 #if defined(__APPLE__) && defined(__MACH__)
 #include <sys/types.h>
@@ -28,7 +29,6 @@
 #ifndef NOMINMAX
 # define NOMINMAX
 #endif
-#include <codecvt>
 #include <locale>
 #include <windows.h>
 #include <fcntl.h>
@@ -1501,6 +1501,77 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
     GGML_UNREACHABLE();
 }
 
+// Validate if a filename is safe to use
+// To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
+bool validate_file_name(const std::string & filename) {
+    if (!filename.length()) {
+        // Empty filename invalid
+        return false;
+    }
+    if (filename.length() > 255) {
+        // Limit at common largest possible filename on Linux filesystems
+        // to avoid unnecessary further validation
+        // (On systems with smaller limits it will be caught by the OS)
+        return false;
+    }
+
+    std::u32string filename_utf32;
+    try {
+        std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
+        filename_utf32 = converter.from_bytes(filename);
+
+        // If the reverse conversion mismatches, it means overlong UTF-8 sequences were used,
+        // or invalid encodings were encountered. Reject such attempts
+        std::string filename_reencoded = converter.to_bytes(filename_utf32);
+        if (filename_reencoded != filename) {
+            return false;
+        }
+    } catch (const std::exception &) {
+        return false;
+    }
+
+    // Check for forbidden codepoints:
+    // - Control characters
+    // - Unicode equivalents of illegal characters
+    // - UTF-16 surrogate pairs
+    // - UTF-8 replacement character
+    // - Byte order mark (BOM)
+    // - Illegal characters: / \ : * ? " < > |
+    for (char32_t c : filename_utf32) {
+        if (c <= 0x1F // Control characters (C0)
+            || c == 0x7F // Control characters (DEL)
+            || (c >= 0x80 && c <= 0x9F) // Control characters (C1)
+            || c == 0xFF0E // Fullwidth Full Stop (period equivalent)
+            || c == 0x2215 // Division Slash (forward slash equivalent)
+            || c == 0x2216 // Set Minus (backslash equivalent)
+            || (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogate pairs
+            || c == 0xFFFD // Replacement Character (UTF-8)
+            || c == 0xFEFF // Byte Order Mark (BOM)
+            || c == '/' || c == '\\' || c == ':' || c == '*' // Illegal characters
+            || c == '?' || c == '"' || c == '<' || c == '>' || c == '|') {
+            return false;
+        }
+    }
+
+    // Reject any leading or trailing ' ', or any trailing '.', these are stripped on Windows and will cause a different filename
+    // Unicode and other whitespace is not affected, only 0x20 space
+    if (filename.front() == ' ' || filename.back() == ' ' || filename.back() == '.') {
+        return false;
+    }
+
+    // Reject any ".." (currently stricter than necessary, it should be fine to just check for == ".." instead)
+    if (filename.find("..") != std::string::npos) {
+        return false;
+    }
+
+    // Reject "."
+    if (filename == ".") {
+        return false;
+    }
+
+    return true;
+}
+
 //
 // String utils
 //
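The comment on `validate_file_name` above notes that a full path should be validated by splitting it on the OS-specific path separator and checking each component with this function. The sketch below illustrates such a caller; the `validate_file_path` helper and the choice to treat both `/` and `\` as separators are illustrative assumptions, not part of this commit.

```cpp
#include <string>
#include <vector>

// Declared in common.h by this commit
bool validate_file_name(const std::string & filename);

// Hypothetical helper (not part of the commit): validate a full path by
// splitting it into components and validating each one individually.
static bool validate_file_path(const std::string & path) {
    std::vector<std::string> parts;
    std::string part;
    for (char ch : path) {
        if (ch == '/' || ch == '\\') { // assumed separators; pick per OS
            if (!part.empty()) { parts.push_back(part); part.clear(); }
        } else {
            part += ch;
        }
    }
    if (!part.empty()) { parts.push_back(part); }
    if (parts.empty())  { return false; } // nothing but separators

    for (const std::string & p : parts) {
        if (!validate_file_name(p)) {
            return false; // reject the whole path on any unsafe component
        }
    }
    return true;
}
```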
@@ -195,6 +195,8 @@ std::string gpt_random_prompt(std::mt19937 & rng);
 
 void process_escapes(std::string& input);
 
+bool validate_file_name(const std::string & filename);
+
 //
 // String utils
 //
@@ -130,7 +130,7 @@ llama_token llama_sampling_sample(
         struct llama_sampling_context * ctx_sampling,
         struct llama_context * ctx_main,
         struct llama_context * ctx_cfg,
-        int idx = 0);
+        int idx = -1);
 
 // Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters.
 llama_token_data_array llama_sampling_prepare(
@@ -160,7 +160,7 @@ class Model(ABC):
                 data = data.astype(np.float32)
 
             # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
                 data = data.astype(np.float32)
 
             # if f16 desired, convert any float32 2-dim weight tensors to float16
@@ -139,7 +139,8 @@ class GGMLFileType(enum.IntEnum):
         dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self)
         if dt is None:
             raise ValueError(self)
-        # 1D tensors are always F32.
+        # Convert all 1D tensors to F32. Most of the codebase that takes in 1D tensors only handles F32 tensors, and most of the outputs tensors are F32.
+        # Also The 1d tensors aren't much of a performance/size issue. So instead of having to have separate F32 and F16 implementations of both, just convert everything to F32 for now.
         return dt if len(tensor.shape) > 1 else DT_F32
 
 
@@ -236,7 +236,7 @@ int main(int argc, char ** argv) {
             // The file exists and is not empty
             session_tokens.resize(n_ctx);
             size_t n_token_count_out = 0;
-            if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
+            if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
                 LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
                 return 1;
             }
@@ -694,7 +694,7 @@ int main(int argc, char ** argv) {
             // optionally save the session on first sample (for faster prompt loading next time)
             if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) {
                 need_to_save_session = false;
-                llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
+                llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
 
                 LOG("saved session to %s\n", path_session.c_str());
             }
@@ -936,7 +936,7 @@ int main(int argc, char ** argv) {
 
     if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
         LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
-        llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
+        llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
     }
 
     llama_print_timings(ctx);
@@ -25,6 +25,7 @@ int main(int argc, char ** argv) {
 
     std::string result0;
     std::string result1;
+    std::string result2;
 
     // init
     llama_model * model;
@@ -45,8 +46,8 @@ int main(int argc, char ** argv) {
 
     // save state (rng, logits, embedding and kv_cache) to file
    {
-        std::vector<uint8_t> state_mem(llama_get_state_size(ctx));
-        const size_t written = llama_copy_state_data(ctx, state_mem.data());
+        std::vector<uint8_t> state_mem(llama_state_get_size(ctx));
+        const size_t written = llama_state_get_data(ctx, state_mem.data());
 
         FILE *fp_write = fopen("dump_state.bin", "wb");
         fwrite(state_mem.data(), 1, written, fp_write);
@@ -98,13 +99,13 @@ int main(int argc, char ** argv) {
 
     // load state (rng, logits, embedding and kv_cache) from file
     {
-        std::vector<uint8_t> state_mem(llama_get_state_size(ctx2));
+        std::vector<uint8_t> state_mem(llama_state_get_size(ctx2));
 
         FILE * fp_read = fopen("dump_state.bin", "rb");
         const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
         fclose(fp_read);
 
-        if (read != llama_set_state_data(ctx2, state_mem.data())) {
+        if (read != llama_state_set_data(ctx2, state_mem.data())) {
             fprintf(stderr, "\n%s : failed to read state\n", __func__);
             llama_free(ctx2);
             llama_free_model(model);
@@ -142,16 +143,104 @@ int main(int argc, char ** argv) {
         n_past += 1;
     }
 
-    printf("\n");
+    printf("\n\n");
 
     llama_free(ctx2);
-    llama_free_model(model);
 
     if (result0 != result1) {
         fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__);
         return 1;
     }
 
+    // make new context
+    auto* ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
+
+    printf("\nsingle seq run: %s", params.prompt.c_str());
+
+    // load state (rng, logits, embedding and kv_cache) from file
+    {
+        std::vector<uint8_t> state_mem(llama_state_get_size(ctx3));
+
+        FILE * fp_read = fopen("dump_state.bin", "rb");
+        const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
+        fclose(fp_read);
+
+        if (read != llama_state_set_data(ctx3, state_mem.data())) {
+            fprintf(stderr, "\n%s : failed to read state\n", __func__);
+            llama_free(ctx3);
+            llama_free_model(model);
+            return 1;
+        }
+
+        fprintf(stderr, "%s : deserialized state from %zd out of a maximum of %zd bytes\n", __func__, read, state_mem.size());
+    }
+
+    // restore state (last tokens)
+    n_past = n_past_saved;
+
+    // save seq 0 and load into seq 1
+    {
+        // save kv of seq 0
+        std::vector<uint8_t> seq_store(llama_state_seq_get_size(ctx3, 0));
+        const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), 0);
+        if (ncopy != seq_store.size()) {
+            fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
+            llama_free(ctx3);
+            llama_free_model(model);
+            return 1;
+        }
+        fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
+
+        // erase whole kv
+        llama_kv_cache_clear(ctx3);
+        fprintf(stderr, "%s : kv cache cleared\n", __func__);
+
+        // restore kv into seq 1
+        const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), 1);
+        if (nset != seq_store.size()) {
+            fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
+            llama_free(ctx3);
+            llama_free_model(model);
+            return 1;
+        }
+        fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset);
+    }
+
+    // third run with seq 1 instead of 0
+    for (auto i = 0; i < params.n_predict; i++) {
+        auto * logits = llama_get_logits(ctx3);
+        auto n_vocab = llama_n_vocab(model);
+        std::vector<llama_token_data> candidates;
+        candidates.reserve(n_vocab);
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        }
+        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+        auto next_token = llama_sample_token(ctx3, &candidates_p);
+        auto next_token_str = llama_token_to_piece(ctx3, next_token);
+
+        printf("%s", next_token_str.c_str());
+        result2 += next_token_str;
+
+        if (llama_decode(ctx3, llama_batch_get_one(&next_token, 1, n_past, 1))) {
+            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            llama_free(ctx3);
+            llama_free_model(model);
+            return 1;
+        }
+        n_past += 1;
+    }
+
+    printf("\n");
+
+    llama_free(ctx3);
+    llama_free_model(model);
+
+    if (result0 != result2) {
+        fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
+        return 1;
+    }
+
     fprintf(stderr, "\n%s : success\n", __func__);
 
     return 0;
@@ -57,6 +57,7 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/
 - `-n N, --n-predict N`: Set the maximum tokens to predict. Default: `-1`
 - `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
 - `--metrics`: enable prometheus `/metrics` compatible endpoint. Default: disabled
+- `--slot-save-path PATH`: Specifies the path where the state of slots (the prompt cache) can be stored. If not provided, the slot management endpoints will be disabled.
 - `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name. Default: template taken from model's metadata. We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
 - `--log-disable`: Output logs to stdout only, not to `llama.log`. Default: enabled
 - `--log-format FORMAT`: Define the log output to FORMAT: json or text Default: `json`
@@ -517,6 +518,57 @@ Available metrics:
 - `llamacpp:requests_processing`: Number of requests processing.
 - `llamacpp:requests_deferred`: Number of requests deferred.
 
+- **POST** `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file.
+
+    *Options:*
+
+    `filename`: Name of the file to save the slot's prompt cache. The file will be saved in the directory specified by the `--slot-save-path` server parameter.
+
+### Result JSON
+
+```json
+{
+    "id_slot": 0,
+    "filename": "slot_save_file.bin",
+    "n_saved": 1745,
+    "n_written": 14309796,
+    "timings": {
+        "save_ms": 49.865
+    }
+}
+```
+
+- **POST** `/slots/{id_slot}?action=restore`: Restore the prompt cache of the specified slot from a file.
+
+    *Options:*
+
+    `filename`: Name of the file to restore the slot's prompt cache from. The file should be located in the directory specified by the `--slot-save-path` server parameter.
+
+### Result JSON
+
+```json
+{
+    "id_slot": 0,
+    "filename": "slot_save_file.bin",
+    "n_restored": 1745,
+    "n_read": 14309796,
+    "timings": {
+        "restore_ms": 42.937
+    }
+}
+```
+
+- **POST** `/slots/{id_slot}?action=erase`: Erase the prompt cache of the specified slot.
+
+### Result JSON
+
+```json
+{
+    "id_slot": 0,
+    "n_erased": 1745
+}
+```
+
 ## More examples
 
 ### Change system prompt on runtime
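For orientation, here is a rough client-side sketch of driving the new slot endpoints documented above. It assumes the server was started with `--slot-save-path`, is reachable at `localhost:8080`, and that the `filename` option is sent as a JSON request body; it uses cpp-httplib purely for illustration. None of these specifics are part of the commit itself.

```cpp
#include <cstdio>

#include "httplib.h" // cpp-httplib, assumed available; any HTTP client would do

int main() {
    httplib::Client cli("http://localhost:8080"); // assumed host/port

    // Save the prompt cache of slot 0; "filename" is resolved against --slot-save-path
    auto saved = cli.Post("/slots/0?action=save",
                          R"({"filename": "slot_save_file.bin"})", "application/json");
    if (saved && saved->status == 200) {
        std::printf("save: %s\n", saved->body.c_str());
    }

    // Later (possibly after a restart) load the same cache back into slot 0
    auto restored = cli.Post("/slots/0?action=restore",
                             R"({"filename": "slot_save_file.bin"})", "application/json");
    if (restored && restored->status == 200) {
        std::printf("restore: %s\n", restored->body.c_str());
    }

    return 0;
}
```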
File diff suppressed because it is too large
@@ -406,7 +406,7 @@
         throw new Error("already running");
       }
       controller.value = new AbortController();
-      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: document.baseURI.replace(/\/+$/, '') })) {
+      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: location.pathname.replace(/\/+$/, '') })) {
         const data = chunk.data;
 
         if (data.stop) {
@@ -1015,6 +1015,10 @@
     }
 
     function App(props) {
+      useEffect(() => {
+        const query = new URLSearchParams(location.search).get("q");
+        if (query) chat(query);
+      }, []);
 
       return html`
         <div class="mode-${session.value.type}">
```diff
@@ -62,7 +62,10 @@ enum server_task_type {
     SERVER_TASK_TYPE_COMPLETION,
     SERVER_TASK_TYPE_CANCEL,
     SERVER_TASK_TYPE_NEXT_RESPONSE,
-    SERVER_TASK_TYPE_METRICS
+    SERVER_TASK_TYPE_METRICS,
+    SERVER_TASK_TYPE_SLOT_SAVE,
+    SERVER_TASK_TYPE_SLOT_RESTORE,
+    SERVER_TASK_TYPE_SLOT_ERASE,
 };

 struct server_task {

@@ -129,6 +132,7 @@ struct server_params {

     bool slots_endpoint = true;
     bool metrics_endpoint = false;
+    std::string slot_save_path;
 };

 struct server_slot {

@@ -1613,6 +1617,107 @@ struct server_context {
                    }
                    queue_results.send(res);
                } break;
+            case SERVER_TASK_TYPE_SLOT_SAVE:
+                {
+                    int id_slot = task.data["id_slot"];
+                    server_slot * slot = get_slot(id_slot);
+                    if (slot == nullptr) {
+                        send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
+                        break;
+                    }
+
+                    const size_t token_count = slot->cache_tokens.size();
+                    const int64_t t_start = ggml_time_us();
+
+                    std::string filename = task.data["filename"];
+                    std::string filepath = task.data["filepath"];
+
+                    const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), token_count);
+
+                    const int64_t t_end = ggml_time_us();
+                    const double t_save_ms = (t_end - t_start) / 1000.0;
+
+                    server_task_result result;
+                    result.id = task.id;
+                    result.stop = true;
+                    result.error = false;
+                    result.data = json {
+                        { "id_slot", id_slot },
+                        { "filename", filename },
+                        { "n_saved", token_count }, // tokens saved
+                        { "n_written", nwrite }, // bytes written
+                        { "timings", {
+                            { "save_ms", t_save_ms }
+                        } }
+                    };
+                    queue_results.send(result);
+                } break;
+            case SERVER_TASK_TYPE_SLOT_RESTORE:
+                {
+                    int id_slot = task.data["id_slot"];
+                    server_slot * slot = get_slot(id_slot);
+                    if (slot == nullptr) {
+                        send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
+                        break;
+                    }
+
+                    const int64_t t_start = ggml_time_us();
+
+                    std::string filename = task.data["filename"];
+                    std::string filepath = task.data["filepath"];
+
+                    slot->cache_tokens.resize(slot->n_ctx);
+                    size_t token_count = 0;
+                    size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count);
+                    if (nread == 0) {
+                        slot->cache_tokens.resize(0);
+                        send_error(task, "Unable to restore slot, no available space in KV cache or invalid slot save file", ERROR_TYPE_INVALID_REQUEST);
+                        break;
+                    }
+                    slot->cache_tokens.resize(token_count);
+
+                    const int64_t t_end = ggml_time_us();
+                    const double t_restore_ms = (t_end - t_start) / 1000.0;
+
+                    server_task_result result;
+                    result.id = task.id;
+                    result.stop = true;
+                    result.error = false;
+                    result.data = json {
+                        { "id_slot", id_slot },
+                        { "filename", filename },
+                        { "n_restored", token_count }, // tokens restored
+                        { "n_read", nread }, // bytes read
+                        { "timings", {
+                            { "restore_ms", t_restore_ms }
+                        } }
+                    };
+                    queue_results.send(result);
+                } break;
+            case SERVER_TASK_TYPE_SLOT_ERASE:
+                {
+                    int id_slot = task.data["id_slot"];
+                    server_slot * slot = get_slot(id_slot);
+                    if (slot == nullptr) {
+                        send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
+                        break;
+                    }
+
+                    // Erase token cache
+                    const size_t n_erased = slot->cache_tokens.size();
+                    llama_kv_cache_seq_rm(ctx, slot->id + 1, -1, -1);
+                    slot->cache_tokens.clear();
+
+                    server_task_result result;
+                    result.id = task.id;
+                    result.stop = true;
+                    result.error = false;
+                    result.data = json {
+                        { "id_slot", id_slot },
+                        { "n_erased", n_erased }
+                    };
+                    queue_results.send(result);
+                } break;
        }
    }
```
```diff
@@ -2250,6 +2355,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
     printf("  --log-disable             disables logging to a file.\n");
     printf("  --slots-endpoint-disable  disables slots monitoring endpoint.\n");
     printf("  --metrics                 enable prometheus compatible metrics endpoint (default: %s).\n", sparams.metrics_endpoint ? "enabled" : "disabled");
+    printf("  --slot-save-path PATH     path to save slot kv cache (default: disabled)\n");
     printf("\n");
     printf("  -n, --n-predict           maximum tokens to predict (default: %d)\n", params.n_predict);
     printf("  --override-kv KEY=TYPE:VALUE\n");

@@ -2658,6 +2764,16 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
            sparams.slots_endpoint = false;
        } else if (arg == "--metrics") {
            sparams.metrics_endpoint = true;
+        } else if (arg == "--slot-save-path") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            sparams.slot_save_path = argv[i];
+            // if doesn't end with DIRECTORY_SEPARATOR, add it
+            if (!sparams.slot_save_path.empty() && sparams.slot_save_path[sparams.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
+                sparams.slot_save_path += DIRECTORY_SEPARATOR;
+            }
        } else if (arg == "--chat-template") {
            if (++i >= argc) {
                invalid_param = true;

@@ -3160,6 +3276,112 @@ int main(int argc, char ** argv) {
        res.status = 200; // HTTP OK
    };

+    const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
+        json request_data = json::parse(req.body);
+        std::string filename = request_data["filename"];
+        if (!validate_file_name(filename)) {
+            res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
+            return;
+        }
+        std::string filepath = sparams.slot_save_path + filename;
+
+        server_task task;
+        task.type = SERVER_TASK_TYPE_SLOT_SAVE;
+        task.data = {
+            { "id_slot", id_slot },
+            { "filename", filename },
+            { "filepath", filepath }
+        };
+
+        const int id_task = ctx_server.queue_tasks.post(task);
+        ctx_server.queue_results.add_waiting_task_id(id_task);
+
+        server_task_result result = ctx_server.queue_results.recv(id_task);
+        ctx_server.queue_results.remove_waiting_task_id(id_task);
+
+        if (result.error) {
+            res_error(res, result.data);
+        } else {
+            res.set_content(result.data.dump(), "application/json");
+        }
+    };
+
+    const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
+        json request_data = json::parse(req.body);
+        std::string filename = request_data["filename"];
+        if (!validate_file_name(filename)) {
+            res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
+            return;
+        }
+        std::string filepath = sparams.slot_save_path + filename;
+
+        server_task task;
+        task.type = SERVER_TASK_TYPE_SLOT_RESTORE;
+        task.data = {
+            { "id_slot", id_slot },
+            { "filename", filename },
+            { "filepath", filepath }
+        };
+
+        const int id_task = ctx_server.queue_tasks.post(task);
+        ctx_server.queue_results.add_waiting_task_id(id_task);
+
+        server_task_result result = ctx_server.queue_results.recv(id_task);
+        ctx_server.queue_results.remove_waiting_task_id(id_task);
+
+        if (result.error) {
+            res_error(res, result.data);
+        } else {
+            res.set_content(result.data.dump(), "application/json");
+        }
+    };
+
+    const auto handle_slots_erase = [&ctx_server, &res_error](const httplib::Request & /* req */, httplib::Response & res, int id_slot) {
+        server_task task;
+        task.type = SERVER_TASK_TYPE_SLOT_ERASE;
+        task.data = {
+            { "id_slot", id_slot },
+        };
+
+        const int id_task = ctx_server.queue_tasks.post(task);
+        ctx_server.queue_results.add_waiting_task_id(id_task);
+
+        server_task_result result = ctx_server.queue_results.recv(id_task);
+        ctx_server.queue_results.remove_waiting_task_id(id_task);
+
+        if (result.error) {
+            res_error(res, result.data);
+        } else {
+            res.set_content(result.data.dump(), "application/json");
+        }
+    };
+
+    const auto handle_slots_action = [&res_error, &handle_slots_save, &handle_slots_restore, &handle_slots_erase](const httplib::Request & req, httplib::Response & res) {
+        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+
+        std::string id_slot_str = req.path_params.at("id_slot");
+        int id_slot;
+
+        try {
+            id_slot = std::stoi(id_slot_str);
+        } catch (const std::exception &) {
+            res_error(res, format_error_response("Invalid slot ID", ERROR_TYPE_INVALID_REQUEST));
+            return;
+        }
+
+        std::string action = req.get_param_value("action");
+
+        if (action == "save") {
+            handle_slots_save(req, res, id_slot);
+        } else if (action == "restore") {
+            handle_slots_restore(req, res, id_slot);
+        } else if (action == "erase") {
+            handle_slots_erase(req, res, id_slot);
+        } else {
+            res_error(res, format_error_response("Invalid action", ERROR_TYPE_INVALID_REQUEST));
+        }
+    };
+
    const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
        json data = {

@@ -3522,6 +3744,10 @@ int main(int argc, char ** argv) {
    svr->Post("/v1/embeddings", handle_embeddings);
    svr->Post("/tokenize", handle_tokenize);
    svr->Post("/detokenize", handle_detokenize);
+    if (!sparams.slot_save_path.empty()) {
+        // only enable slot endpoints if slot_save_path is set
+        svr->Post("/slots/:id_slot", handle_slots_action);
+    }

    //
    // Start the server
```
examples/server/tests/features/slotsave.feature (new file, 58 lines):

```gherkin
@llama.cpp
@slotsave
Feature: llama.cpp server slot management

  Background: Server startup
    Given a server listening on localhost:8080
    And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
    And prompt caching is enabled
    And 2 slots
    And . as slot save path
    And 2048 KV cache size
    And 42 as server seed
    And 24 max tokens to predict
    Then the server is starting
    Then the server is healthy

  Scenario: Save and Restore Slot
    # First prompt in slot 1 should be fully processed
    Given a user prompt "What is the capital of France?"
    And using slot id 1
    And a completion request with no api error
    Then 24 tokens are predicted matching (Lily|cake)
    And 22 prompt tokens are processed
    When the slot 1 is saved with filename "slot1.bin"
    Then the server responds with status code 200
    # Since we have cache, this should only process the last tokens
    Given a user prompt "What is the capital of Germany?"
    And a completion request with no api error
    Then 24 tokens are predicted matching (Thank|special)
    And 7 prompt tokens are processed
    # Loading the original cache into slot 0,
    # we should only be processing 1 prompt token and get the same output
    When the slot 0 is restored with filename "slot1.bin"
    Then the server responds with status code 200
    Given a user prompt "What is the capital of France?"
    And using slot id 0
    And a completion request with no api error
    Then 24 tokens are predicted matching (Lily|cake)
    And 1 prompt tokens are processed
    # For verification that slot 1 was not corrupted during slot 0 load, same thing
    Given a user prompt "What is the capital of Germany?"
    And using slot id 1
    And a completion request with no api error
    Then 24 tokens are predicted matching (Thank|special)
    And 1 prompt tokens are processed

  Scenario: Erase Slot
    Given a user prompt "What is the capital of France?"
    And using slot id 1
    And a completion request with no api error
    Then 24 tokens are predicted matching (Lily|cake)
    And 22 prompt tokens are processed
    When the slot 1 is erased
    Then the server responds with status code 200
    Given a user prompt "What is the capital of France?"
    And a completion request with no api error
    Then 24 tokens are predicted matching (Lily|cake)
    And 22 prompt tokens are processed
```
```diff
@@ -49,6 +49,9 @@ def step_server_config(context, server_fqdn, server_port):
     context.n_predict = None
     context.n_prompts = 0
     context.n_server_predict = None
+    context.slot_save_path = None
+    context.id_slot = None
+    context.cache_prompt = None
     context.n_slots = None
     context.prompt_prefix = None
     context.prompt_suffix = None

@@ -119,6 +122,21 @@ def step_server_n_predict(context, n_predict):
     context.n_server_predict = n_predict


+@step('{slot_save_path} as slot save path')
+def step_slot_save_path(context, slot_save_path):
+    context.slot_save_path = slot_save_path
+
+
+@step('using slot id {id_slot:d}')
+def step_id_slot(context, id_slot):
+    context.id_slot = id_slot
+
+
+@step('prompt caching is enabled')
+def step_enable_prompt_cache(context):
+    context.cache_prompt = True
+
+
 @step('continuous batching')
 def step_server_continuous_batching(context):
     context.server_continuous_batching = True

@@ -212,6 +230,8 @@ async def step_request_completion(context, api_error):
                                          context.base_url,
                                          debug=context.debug,
                                          n_predict=context.n_predict,
+                                         cache_prompt=context.cache_prompt,
+                                         id_slot=context.id_slot,
                                          seed=await completions_seed(context),
                                          expect_api_error=expect_api_error,
                                          user_api_key=context.user_api_key)

@@ -711,12 +731,48 @@ async def concurrent_requests(context, f_completion, *args, **kwargs):
         await asyncio.sleep(0.1)


+@step('the slot {slot_id:d} is saved with filename "{filename}"')
+@async_run_until_complete
+async def step_save_slot(context, slot_id, filename):
+    async with aiohttp.ClientSession() as session:
+        async with session.post(f'{context.base_url}/slots/{slot_id}?action=save',
+                                json={"filename": filename},
+                                headers={"Content-Type": "application/json"}) as response:
+            context.response = response
+
+
+@step('the slot {slot_id:d} is restored with filename "{filename}"')
+@async_run_until_complete
+async def step_restore_slot(context, slot_id, filename):
+    async with aiohttp.ClientSession() as session:
+        async with session.post(f'{context.base_url}/slots/{slot_id}?action=restore',
+                                json={"filename": filename},
+                                headers={"Content-Type": "application/json"}) as response:
+            context.response = response
+
+
+@step('the slot {slot_id:d} is erased')
+@async_run_until_complete
+async def step_erase_slot(context, slot_id):
+    async with aiohttp.ClientSession() as session:
+        async with session.post(f'{context.base_url}/slots/{slot_id}?action=erase',
+                                headers={"Content-Type": "application/json"}) as response:
+            context.response = response
+
+
+@step('the server responds with status code {status_code:d}')
+def step_server_responds_with_status_code(context, status_code):
+    assert context.response.status == status_code
+
+
 async def request_completion(prompt,
                              base_url,
                              debug=False,
                              prompt_prefix=None,
                              prompt_suffix=None,
                              n_predict=None,
+                             cache_prompt=False,
+                             id_slot=None,
                              seed=None,
                              expect_api_error=None,
                              user_api_key=None):

@@ -738,6 +794,8 @@ async def request_completion(prompt,
                                     "prompt": prompt,
                                     "input_suffix": prompt_suffix,
                                     "n_predict": n_predict if n_predict is not None else -1,
+                                    "cache_prompt": cache_prompt,
+                                    "id_slot": id_slot,
                                     "seed": seed if seed is not None else 42
                                 },
                                 headers=headers,

@@ -1104,6 +1162,8 @@ def start_server_background(context):
         server_args.extend(['--parallel', context.n_slots])
     if context.n_server_predict:
         server_args.extend(['--n-predict', context.n_server_predict])
+    if context.slot_save_path:
+        server_args.extend(['--slot-save-path', context.slot_save_path])
     if context.server_api_key:
         server_args.extend(['--api-key', context.server_api_key])
     if context.n_ga:
```
```diff
@@ -1225,7 +1225,7 @@ static void ggml_cuda_op_mul_mat_cublas(

         // the main device has a larger memory buffer to hold the results from all GPUs
         // ldc == nrows of the matrix that cuBLAS writes into
-        int ldc = id == ctx.device ? ne0 : row_diff;
+        int64_t ldc = id == ctx.device ? ne0 : row_diff;

         const int compute_capability = ggml_cuda_info().devices[id].cc;

@@ -1377,8 +1377,8 @@ static void ggml_cuda_op_mul_mat(
     const int64_t ne0 = dst->ne[0];
     const int64_t ne1 = dst->ne[1];

-    const int nb2 = dst->nb[2];
-    const int nb3 = dst->nb[3];
+    const int64_t nb2 = dst->nb[2];
+    const int64_t nb3 = dst->nb[3];

     GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer));
     GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer));
```
```diff
@@ -394,7 +394,7 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 // TODO: move to ggml-common.h
 static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

-typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
+typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);


 //////////////////////
```
```diff
@@ -4,14 +4,14 @@
 #define CUDA_Q8_0_NE_ALIGN 2048

 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
-    const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
+static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
+    const int64_t i = 2*(blockDim.x*blockIdx.x + threadIdx.x);

     if (i >= k) {
         return;
     }

-    const int ib = i/qk; // block index
+    const int64_t ib = i/qk; // block index
     const int iqs = (i%qk)/qr; // quant index
     const int iybs = i - i%qk; // y block start index
     const int y_offset = qr == 1 ? 1 : qk/2;

@@ -25,7 +25,7 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
 }

 template <bool need_check>
-static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int k) {
+static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int64_t k) {
 #if __CUDA_ARCH__ >= CC_PASCAL
     constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;

@@ -68,13 +68,13 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
 template<typename dst_t>
 static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {

-    const int i = blockIdx.x;
+    const int64_t i = blockIdx.x;

     // assume 32 threads
     const int tid = threadIdx.x;
     const int il = tid/8;
     const int ir = tid%8;
-    const int ib = 8*i + ir;
+    const int64_t ib = 8*i + ir;
     if (ib >= nb32) {
         return;
     }

@@ -96,13 +96,13 @@ static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t
 template<typename dst_t>
 static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {

-    const int i = blockIdx.x;
+    const int64_t i = blockIdx.x;

     // assume 32 threads
     const int tid = threadIdx.x;
     const int il = tid/8;
     const int ir = tid%8;
-    const int ib = 8*i + ir;
+    const int64_t ib = 8*i + ir;
     if (ib >= nb32) {
         return;
     }

@@ -313,14 +313,14 @@ template<typename dst_t>
 static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
     const block_q6_K * x = (const block_q6_K *) vx;

-    const int i = blockIdx.x;
+    const int64_t i = blockIdx.x;
 #if QK_K == 256

     // assume 64 threads - this is very slightly better than the one below
-    const int tid = threadIdx.x;
-    const int ip = tid/32; // ip is 0 or 1
-    const int il = tid - 32*ip; // 0...32
-    const int is = 8*ip + il/16;
+    const int64_t tid = threadIdx.x;
+    const int64_t ip = tid/32; // ip is 0 or 1
+    const int64_t il = tid - 32*ip; // 0...32
+    const int64_t is = 8*ip + il/16;

     dst_t * y = yy + i*QK_K + 128*ip + il;

@@ -337,9 +337,9 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
 #else

     // assume 32 threads
-    const int tid = threadIdx.x;
-    const int ip  = tid/16;         // 0 or 1
-    const int il  = tid - 16*ip;    // 0...15
+    const int64_t tid = threadIdx.x;
+    const int64_t ip  = tid/16;         // 0 or 1
+    const int64_t il  = tid - 16*ip;    // 0...15

     dst_t * y = yy + i*QK_K + 16*ip + il;

@@ -571,12 +571,12 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
 #endif

 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
+static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
     const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE);
     dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }

-static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int k, cudaStream_t stream) {
+static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int64_t k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
     if (k % CUDA_Q8_0_NE_ALIGN == 0) {
         const bool need_check = false;

@@ -588,7 +588,7 @@ static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half *
 }

 template<typename dst_t>
-static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;
 #if QK_K == 256
     dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);

@@ -598,7 +598,7 @@ static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cu
 }

 template<typename dst_t>
-static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;
 #if QK_K == 256
     dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);

@@ -608,27 +608,27 @@ static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cu
 }

 template<typename dst_t>
-static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb32 = k / 32;
     const int nb = (k + 255) / 256;
     dequantize_block_q4_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
 }

 template<typename dst_t>
-static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb32 = k / 32;
     const int nb = (k + 255) / 256;
     dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
 }

 template<typename dst_t>
-static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;
     dequantize_block_q4_K<<<nb, 32, 0, stream>>>(vx, y);
 }

 template<typename dst_t>
-static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;
 #if QK_K == 256
     dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);

@@ -638,7 +638,7 @@ static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cu
 }

 template<typename dst_t>
-static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;
 #if QK_K == 256
     dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);

@@ -648,55 +648,55 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
 }

 template<typename dst_t>
-static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;
     dequantize_block_iq2_xxs<<<nb, 32, 0, stream>>>(vx, y);
 }

 template<typename dst_t>
-static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;
     dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
 }

 template<typename dst_t>
-static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;
     dequantize_block_iq2_s<<<nb, 32, 0, stream>>>(vx, y);
 }

 template<typename dst_t>
-static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;
     dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
 }

 template<typename dst_t>
-static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;
     dequantize_block_iq3_s<<<nb, 32, 0, stream>>>(vx, y);
 }

 template<typename dst_t>
-static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;
     dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
 }

 template<typename dst_t>
-static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = (k + QK_K - 1) / QK_K;
     dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
 }

 template<typename dst_t>
-static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;
     dequantize_block_iq1_m<<<nb, 32, 0, stream>>>(vx, y);
 }

 template<typename dst_t>
-static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = (k + QK_K - 1) / QK_K;
 #if QK_K == 64
     dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);

@@ -706,8 +706,8 @@ static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k,
 }

 template <typename src_t, typename dst_t>
-static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
+    const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;

     if (i >= k) {
         return;

@@ -719,7 +719,7 @@ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __res
 }

 template <typename src_t, typename dst_t>
-static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
+static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     convert_unary<src_t><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }
```
```diff
@@ -3,7 +3,7 @@
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256

 template<typename T>
-using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int k, cudaStream_t stream);
+using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);

 typedef to_t_cuda_t<float> to_fp32_cuda_t;
 typedef to_t_cuda_t<half> to_fp16_cuda_t;
```
```diff
@@ -1,6 +1,6 @@
 #include "common.cuh"

-static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
+static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
     const block_q4_0 * x = (const block_q4_0 *) vx;

     const dfloat d = x[ib].d;

@@ -19,7 +19,7 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
 #endif // GGML_CUDA_F16
 }

-static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
+static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
     const block_q4_1 * x = (const block_q4_1 *) vx;

     const dfloat d = __low2half(x[ib].dm);

@@ -39,7 +39,7 @@ static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const in
 #endif // GGML_CUDA_F16
 }

-static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
+static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
     const block_q5_0 * x = (const block_q5_0 *) vx;

     const dfloat d = x[ib].d;

@@ -62,7 +62,7 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
 #endif // GGML_CUDA_F16
 }

-static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
+static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
     const block_q5_1 * x = (const block_q5_1 *) vx;

     const dfloat d = __low2half(x[ib].dm);

@@ -86,7 +86,7 @@ static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const in
 #endif // GGML_CUDA_F16
 }

-static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
+static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
     const block_q8_0 * x = (const block_q8_0 *) vx;

     const dfloat d = x[ib].d;
```
```diff
@@ -565,7 +565,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
     }
 }

-static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
+static __device__ void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
     const half * x = (const half *) vx;

     // automatic half -> float type cast if dfloat == float

@@ -577,7 +577,7 @@ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
     // qk = quantized weights per x block
     // qr = number of quantized weights per data value in x block
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
+    const int64_t row = (int64_t)blockIdx.x*blockDim.y + threadIdx.y;

     if (row >= nrows) {
         return;

@@ -598,7 +598,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons

     for (int i = 0; i < ncols; i += iter_stride) {
         const int col = i + vals_per_iter*tid;
-        const int ib = (row*ncols + col)/qk; // x block index
+        const int64_t ib = ((int64_t)row*ncols + col)/qk; // x block index
         const int iqs = (col%qk)/qr; // x quant index
         const int iybs = col - col%qk; // y block start index
```
```diff
@@ -1,20 +1,20 @@
 #include "quantize.cuh"

-static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) {
-    const int ix = blockDim.x*blockIdx.x + threadIdx.x;
+static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx_padded) {
+    const int64_t ix = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;

     if (ix >= kx_padded) {
         return;
     }

-    const int iy = blockDim.y*blockIdx.y + threadIdx.y;
+    const int64_t iy = (int64_t)blockDim.y*blockIdx.y + threadIdx.y;

-    const int i_padded = iy*kx_padded + ix;
+    const int64_t i_padded = (int64_t)iy*kx_padded + ix;

     block_q8_1 * y = (block_q8_1 *) vy;

-    const int ib = i_padded / QK8_1; // block index
-    const int iqs = i_padded % QK8_1; // quant index
+    const int64_t ib = i_padded / QK8_1; // block index
+    const int64_t iqs = i_padded % QK8_1; // quant index

     const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
     float amax = fabsf(xi);

@@ -36,8 +36,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
     reinterpret_cast<half&>(y[ib].ds.y) = sum;
 }

-void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) {
-    const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream) {
+    const int64_t block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
     const dim3 num_blocks(block_num_x, ky, 1);
     const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
     quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, kx, kx_padded);
```
```diff
@@ -2,4 +2,4 @@

 #define CUDA_QUANTIZE_BLOCK_SIZE 256

-void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream);
+void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream);
```
ggml-quants.c: 310 changes (file diff suppressed because it is too large)

ggml-quants.h: 148 changes
@ -12,70 +12,70 @@ extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Quantization
|
// Quantization
|
||||||
void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int k);
|
void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int k);
|
void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int k);
|
void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int k);
|
void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int k);
|
void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int k);
|
void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k);
|
void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int k);
|
void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int k);
|
void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k);
|
void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
|
void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
|
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
|
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
|
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int k);
|
void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int k);
|
void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int k);
|
void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
// Dequantization
|
// Dequantization
|
||||||
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
// Dot product
|
// Dot product
|
||||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||||
|
@@ -101,26 +101,26 @@ void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
 void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

 // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
-size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);

-size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
+size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);

 void iq2xs_init_impl(enum ggml_type type);
 void iq2xs_free_impl(enum ggml_type type);
@@ -15776,7 +15776,7 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
 #ifdef GGML_SYCL_FORCE_DMMV
 const bool use_mul_mat_vec_q = false;
 #else
-bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
+bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type);
 use_mul_mat_vec_q = use_mul_mat_vec_q ||
 (src0->type == GGML_TYPE_IQ2_XXS) || (src0->type == GGML_TYPE_IQ2_XS) || (src0->type == GGML_TYPE_IQ2_S) ||
 (src0->type == GGML_TYPE_IQ3_XXS) || (src0->type == GGML_TYPE_IQ3_S) ||
@@ -15787,7 +15787,6 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
 #endif // GGML_SYCL_FORCE_DMMV

 if (use_mul_mat_vec_q) {
-// NOTE: this kernel does not support ggml_nrows(src1) > 1
 // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_vec_q path\n");
 ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
 } else {
ggml.c (16 changes)

@@ -338,14 +338,14 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
 return GGML_FP32_TO_FP16(x);
 }

-void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n) {
+void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
-for (int i = 0; i < n; i++) {
+for (int64_t i = 0; i < n; i++) {
 y[i] = GGML_FP16_TO_FP32(x[i]);
 }
 }

-void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) {
+void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
-int i = 0;
+int64_t i = 0;
 #if defined(__F16C__)
 for (; i + 7 < n; i += 8) {
 __m256 x_vec = _mm256_loadu_ps(x + i);
@@ -20332,11 +20332,11 @@ size_t ggml_quantize_chunk(
 enum ggml_type type,
 const float * src,
 void * dst,
-int start,
+int64_t start,
-int nrows,
+int64_t nrows,
-int n_per_row,
+int64_t n_per_row,
 const float * imatrix) {
-const int n = nrows * n_per_row;
+const int64_t n = (int64_t) nrows * n_per_row;

 if (ggml_quantize_requires_imatrix(type)) {
 GGML_ASSERT(imatrix != NULL);
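For context (not part of the patch): a minimal sketch of the overflow that this widening of the element counts avoids. The tensor dimensions below are made-up examples, not values from the commit.

    #include <cstdint>
    #include <cstdio>

    int main() {
        // Hypothetical large tensor: 2^17 rows of 2^16 elements, i.e. 2^33 elements in total.
        const int64_t nrows     = 131072;
        const int64_t n_per_row = 65536;

        const int64_t n64 = nrows * n_per_row;   // what ggml_quantize_chunk now computes
        const int32_t n32 = (int32_t) n64;       // the old int-based count would have truncated here

        printf("64-bit count: %lld, truncated 32-bit count: %d\n", (long long) n64, n32);
        return 0;
    }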
ggml.h (14 changes)

@@ -339,8 +339,8 @@ extern "C" {
 GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
 GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);

-GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
+GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n);
-GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);
+GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n);

 struct ggml_object;
 struct ggml_context;
@@ -2217,9 +2217,9 @@ extern "C" {
 enum ggml_type type,
 const float * src,
 void * dst,
-int start,
+int64_t start,
-int nrows,
+int64_t nrows,
-int n_per_row,
+int64_t n_per_row,
 const float * imatrix);

 //
@@ -2384,8 +2384,8 @@ extern "C" {
 #else
 #define GGML_RESTRICT restrict
 #endif
-typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
 const void * GGML_RESTRICT y, size_t by, int nrc);
@@ -639,6 +639,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
 MODEL_TENSOR.FFN_GATE,
 MODEL_TENSOR.FFN_DOWN,
 MODEL_TENSOR.FFN_UP,
+MODEL_TENSOR.ATTN_K_NORM,
+MODEL_TENSOR.ATTN_Q_NORM,
 ],
 # TODO
 }
@@ -285,12 +285,14 @@ class TensorNameMap:
 MODEL_TENSOR.ATTN_Q_NORM: (
 "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
 "model.layers.{bid}.self_attn.q_layernorm", # persimmon
+"model.layers.{bid}.self_attn.q_norm", # cohere
 "transformer.blocks.{bid}.attn.q_ln", # sea-lion
 ),

 MODEL_TENSOR.ATTN_K_NORM: (
 "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
 "model.layers.{bid}.self_attn.k_layernorm", # persimmon
+"model.layers.{bid}.self_attn.k_norm", # cohere
 "transformer.blocks.{bid}.attn.k_ln", # sea-lion
 ),
@@ -622,7 +622,7 @@ maxhordelen = 256
 modelbusy = threading.Lock()
 requestsinqueue = 0
 defaultport = 5001
-KcppVersion = "1.62"
+KcppVersion = "1.62.1"
 showdebug = True
 showsamplerwarning = True
 showmaxctxwarning = True
llama.cpp (578 changes)

@@ -948,6 +948,8 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
 { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
 { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
 { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
 },
 },
 {
@@ -2207,7 +2209,7 @@ struct llama_context {

 std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
 size_t output_size = 0; // capacity (of tokens positions) for the output buffers
-int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch
+int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch

 bool logits_all = false;
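A small illustration (not from the patch) of how the format strings added to LLM_TENSOR_NAMES resolve to per-layer GGUF tensor names; the layer index 12 is an arbitrary example.

    #include <cstdio>

    int main() {
        // The per-layer tensor name is produced from the "blk.%d.attn_q_norm" format string above.
        char name[64];
        snprintf(name, sizeof(name), "blk.%d.attn_q_norm", 12);
        printf("%s\n", name); // prints: blk.12.attn_q_norm
        return 0;
    }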
@@ -5476,6 +5478,11 @@ static bool llm_load_tensors(

 layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});

+if (n_layer >= 64){
+layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head});
+layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv});
+}
+
 layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
 layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
 layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
@@ -9524,6 +9531,31 @@ struct llm_build_context {
 cb(Vcur, "Vcur", il);
 }

+if (model.layers[il].attn_q_norm) {
+Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
+ggml_element_size(Qcur) * n_embd_head,
+ggml_element_size(Qcur) * n_embd_head * n_head,
+0);
+cb(Qcur, "Qcur", il);
+Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
+ggml_element_size(Kcur) * n_embd_head,
+ggml_element_size(Kcur) * n_embd_head * n_head_kv,
+0);
+cb(Kcur, "Kcur", il);
+
+Qcur = llm_build_norm(ctx0, Qcur, hparams,
+model.layers[il].attn_q_norm,
+NULL,
+LLM_NORM, cb, il);
+cb(Qcur, "Qcur", il);
+
+Kcur = llm_build_norm(ctx0, Kcur, hparams,
+model.layers[il].attn_k_norm,
+NULL,
+LLM_NORM, cb, il);
+cb(Kcur, "Kcur", il);
+}
+
 Qcur = ggml_rope_custom(
 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
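As a standalone illustration of the idea behind the added graph code above: Q (and K) are viewed as [head_dim, n_head, n_tokens] and normalized head by head. The sketch below is plain CPU code with made-up sizes and no norm weights, not the ggml graph ops used in the patch.

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Normalize each attention head of q independently (layernorm without weight/bias).
    static void norm_per_head(std::vector<float> & q, int head_dim, int n_head, int n_tokens, float eps = 1e-5f) {
        for (int t = 0; t < n_tokens; ++t) {
            for (int h = 0; h < n_head; ++h) {
                float * v = q.data() + (t * n_head + h) * head_dim;
                float mean = 0.0f, var = 0.0f;
                for (int i = 0; i < head_dim; ++i) mean += v[i];
                mean /= head_dim;
                for (int i = 0; i < head_dim; ++i) var += (v[i] - mean) * (v[i] - mean);
                var /= head_dim;
                const float inv = 1.0f / std::sqrt(var + eps);
                for (int i = 0; i < head_dim; ++i) v[i] = (v[i] - mean) * inv;
            }
        }
    }

    int main() {
        std::vector<float> q(4 * 2 * 3, 1.5f); // head_dim = 4, n_head = 2, n_tokens = 3
        norm_per_head(q, 4, 2, 3);
        printf("first element after per-head norm: %f\n", q[0]);
        return 0;
    }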
@@ -10481,6 +10513,9 @@ static int llama_decode_internal(
 n_outputs_prev += lctx.n_outputs;
 }

+// set to total number of outputs in the batch, for use in llama_get_logits_ith
+lctx.n_outputs = n_outputs;
+
 // wait for the computation to finish (automatically done when obtaining the model output)
 //llama_synchronize(&lctx);

@@ -13635,9 +13670,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
 return new_type;
 }

-static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
+static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
 std::mutex mutex;
-int counter = 0;
+int64_t counter = 0;
 size_t new_size = 0;
 if (nthread < 2) {
 // single-thread
@@ -13645,11 +13680,11 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
 }
 auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
 nrows, n_per_row, imatrix]() {
-const int nrows_per_chunk = chunk_size / n_per_row;
+const int64_t nrows_per_chunk = chunk_size / n_per_row;
 size_t local_size = 0;
 while (true) {
 std::unique_lock<std::mutex> lock(mutex);
-int first_row = counter; counter += nrows_per_chunk;
+int64_t first_row = counter; counter += nrows_per_chunk;
 if (first_row >= nrows) {
 if (local_size > 0) {
 new_size += local_size;
@@ -13657,7 +13692,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
 break;
 }
 lock.unlock();
-const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
+const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
 local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
 }
 };
@@ -13780,7 +13815,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 const std::string name = ggml_get_name(meta);

 // TODO: avoid hardcoded tensor names - use the TN_* constants
-if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
+if (name.find("attn_v.weight") != std::string::npos ||
+name.find("attn_qkv.weight") != std::string::npos) {
 ++qs.n_attention_wv;
 } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
 qs.has_output = true;
@@ -13790,7 +13826,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;

 // sanity checks
-GGML_ASSERT(qs.n_attention_wv == (int)model.hparams.n_layer && "n_attention_wv != n_layer is unexpected");
+//
+// - qs.n_attention_wv == 0 for Mamba models
+// - qs.n_attention_wv == model.hparams.n_layer for Transformer models
+//
+GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer) && "n_attention_wv is unexpected");

 size_t total_size_org = 0;
 size_t total_size_new = 0;
@@ -13846,6 +13886,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

 // quantize only 2D and 3D tensors (experts)
 quantize &= (ggml_n_dims(tensor) >= 2);
+
+// do not quantize norm tensors
+quantize &= name.find("_norm.weight") == std::string::npos;
+
 quantize &= params->quantize_output_tensor || name != "output.weight";
 quantize &= !params->only_copy;

@@ -13874,10 +13918,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 if (!params->pure && ggml_is_quantized(default_type)) {
 new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
 }
-else if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
+if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
 new_type = params->token_embedding_type;
 }
-else if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
+if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
 new_type = params->output_tensor_type;
 }

@@ -13892,7 +13936,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 new_size = ggml_nbytes(tensor);
 LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
 } else {
-const size_t nelements = ggml_nelements(tensor);
+const int64_t nelements = ggml_nelements(tensor);

 const float * imatrix = nullptr;
 if (imatrix_data) {
@@ -13944,20 +13988,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
 fflush(stdout);

-if (work.size() < nelements * 4) {
+if (work.size() < (size_t)nelements * 4) {
 work.resize(nelements * 4); // upper bound on size
 }
 new_data = work.data();

-const int n_per_row = tensor->ne[0];
+const int64_t n_per_row = tensor->ne[0];
-const int nrows = tensor->ne[1];
+const int64_t nrows = tensor->ne[1];

-static const int min_chunk_size = 32 * 512;
+static const int64_t min_chunk_size = 32 * 512;
-const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
+const int64_t chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);

-const int nelements_matrix = tensor->ne[0] * tensor->ne[1];
+const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
-const int nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
+const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
-const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
+const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;

 // quantize each expert separately since they have different importance matrices
 new_size = 0;
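A worked example (not from the patch) of the chunking arithmetic in the hunk above, using made-up tensor dimensions and thread count.

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t n_per_row      = 4096;     // hypothetical row width
        const int64_t nrows          = 32000;    // hypothetical row count
        const int64_t min_chunk_size = 32 * 512; // same constant as in the patch

        // rows are grouped so that each chunk holds at least min_chunk_size elements
        const int64_t chunk_size = n_per_row >= min_chunk_size
            ? n_per_row
            : n_per_row * ((min_chunk_size + n_per_row - 1) / n_per_row);

        const int64_t nelements_matrix = n_per_row * nrows;
        const int64_t nchunk           = (nelements_matrix + chunk_size - 1) / chunk_size;
        const int64_t nthread          = 8;
        const int64_t nthread_use      = nthread > 1 ? std::max((int64_t) 1, std::min(nthread, nchunk)) : 1;

        printf("chunk_size=%lld nchunk=%lld threads used=%lld\n",
               (long long) chunk_size, (long long) nchunk, (long long) nthread_use);
        return 0;
    }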
@@ -15212,9 +15256,33 @@ void llama_kv_cache_update(struct llama_context * ctx) {
 llama_kv_cache_update_internal(*ctx);
 }

+// deprecated
+size_t llama_get_state_size(const struct llama_context * ctx) {
+return llama_state_get_size(ctx);
+}
+
+// deprecated
+size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
+return llama_state_get_data(ctx, dst);
+}
+
+// deprecated
+size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
+return llama_state_set_data(ctx, src);
+}
+
+// deprecated
+bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+return llama_state_load_file(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
+}
+
+// deprecated
+bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+return llama_state_save_file(ctx, path_session, tokens, n_token_count);
+}
+
 // Returns the *maximum* size of the state
-size_t llama_get_state_size(const struct llama_context * ctx) {
+size_t llama_state_get_size(const struct llama_context * ctx) {
 const auto & cparams = ctx->cparams;
 const auto & hparams = ctx->model.hparams;

@@ -15302,15 +15370,15 @@ struct llama_data_file_context : llama_data_context {
 * file context:
 * llama_file file("/path", "wb");
 * llama_data_file_context data_ctx(&file);
-* llama_copy_state_data(ctx, &data_ctx);
+* llama_state_get_data(ctx, &data_ctx);
 *
 * buffer context:
 * std::vector<uint8_t> buf(max_size, 0);
 * llama_data_buffer_context data_ctx(&buf.data());
-* llama_copy_state_data(ctx, &data_ctx);
+* llama_state_get_data(ctx, &data_ctx);
 *
 */
-static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
+static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
 // copy rng
 {
 std::ostringstream rng_ss;
@@ -15454,15 +15522,15 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
 }
 }

-size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
+size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) {
 llama_data_buffer_context data_ctx(dst);
-llama_copy_state_data_internal(ctx, &data_ctx);
+llama_state_get_data_internal(ctx, &data_ctx);

 return data_ctx.get_size_written();
 }

 // Sets the state reading from the specified source address
-size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
+size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
 const uint8_t * inp = src;

 // set rng
@@ -15614,14 +15682,14 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
 }

 const size_t nread = inp - src;
-const size_t max_size = llama_get_state_size(ctx);
+const size_t max_size = llama_state_get_size(ctx);

 GGML_ASSERT(nread <= max_size);

 return nread;
 }

-static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
 llama_file file(path_session, "rb");

 // sanity checks
@@ -15659,7 +15727,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
 // restore the context state
 {
 const size_t n_state_size_cur = file.size - file.tell();
-const size_t n_state_size_max = llama_get_state_size(ctx);
+const size_t n_state_size_max = llama_state_get_size(ctx);

 if (n_state_size_cur > n_state_size_max) {
 LLAMA_LOG_ERROR("%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur);
@@ -15669,22 +15737,22 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
 std::vector<uint8_t> state_data(n_state_size_max);
 file.read_raw(state_data.data(), n_state_size_cur);

-llama_set_state_data(ctx, state_data.data());
+llama_state_set_data(ctx, state_data.data());
 }

 return true;
 }

-bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
 try {
-return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
+return llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
 } catch (const std::exception & err) {
 LLAMA_LOG_ERROR("error loading session file: %s\n", err.what());
 return false;
 }
 }

-bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+static bool llama_state_save_file_internal(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
 llama_file file(path_session, "wb");

 file.write_u32(LLAMA_SESSION_MAGIC);
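A hedged usage sketch of the renamed whole-context state API introduced above (llama_state_get_size / llama_state_get_data / llama_state_set_data). It assumes an already-created llama_context named ctx; it is an illustration, not code from the commit.

    #include <vector>
    #include "llama.h"

    // Serialize the full context state into a byte buffer.
    static std::vector<uint8_t> snapshot_state(llama_context * ctx) {
        std::vector<uint8_t> buf(llama_state_get_size(ctx)); // maximum size of the serialized state
        buf.resize(llama_state_get_data(ctx, buf.data()));   // shrink to the number of bytes actually written
        return buf;
    }

    // Restore a previously captured state.
    static void restore_state(llama_context * ctx, const std::vector<uint8_t> & buf) {
        llama_state_set_data(ctx, buf.data());
    }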
@ -15698,11 +15766,420 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
|
||||||
|
|
||||||
// save the context state using stream saving
|
// save the context state using stream saving
|
||||||
llama_data_file_context data_ctx(&file);
|
llama_data_file_context data_ctx(&file);
|
||||||
llama_copy_state_data_internal(ctx, &data_ctx);
|
llama_state_get_data_internal(ctx, &data_ctx);
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
|
||||||
|
try {
|
||||||
|
return llama_state_save_file_internal(ctx, path_session, tokens, n_token_count);
|
||||||
|
} catch (const std::exception & err) {
|
||||||
|
LLAMA_LOG_ERROR("error saving session file: %s\n", err.what());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id) {
|
||||||
|
// save the size of size_t as a uint32_t for safety check
|
||||||
|
const size_t size_t_size_size = sizeof(uint32_t);
|
||||||
|
|
||||||
|
// other values
|
||||||
|
const size_t s_cell_count_size = sizeof(uint32_t);
|
||||||
|
const size_t s_layer_count_size = sizeof(uint32_t);
|
||||||
|
const size_t n_embd_v_gqa_size = sizeof(uint32_t);
|
||||||
|
|
||||||
|
size_t s_cell_count = 0;
|
||||||
|
size_t s_cell_data_size = 0;
|
||||||
|
const auto & kv_self = ctx->kv_self;
|
||||||
|
const auto & hparams = ctx->model.hparams;
|
||||||
|
|
||||||
|
const uint32_t n_layer = hparams.n_layer;
|
||||||
|
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
|
||||||
|
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
|
||||||
|
|
||||||
|
for (uint32_t i = 0; i < kv_self.size; ++i) {
|
||||||
|
const auto & cell = kv_self.cells[i];
|
||||||
|
if (cell.seq_id.count(seq_id) > 0) {
|
||||||
|
++s_cell_count;
|
||||||
|
s_cell_data_size += sizeof(llama_pos);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int il = 0; il < (int)n_layer; ++il) {
|
||||||
|
// types of keys and values
|
||||||
|
s_cell_data_size += sizeof(int32_t) * 2;
|
||||||
|
// k_size_row and v_size_el values of layer
|
||||||
|
s_cell_data_size += sizeof(size_t) * 2;
|
||||||
|
|
||||||
|
// keys
|
||||||
|
const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
|
||||||
|
s_cell_data_size += k_size_row * s_cell_count;
|
||||||
|
|
||||||
|
// values (transposed)
|
||||||
|
const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
|
||||||
|
s_cell_data_size += v_size_el * s_cell_count * n_embd_v_gqa;
|
||||||
|
}
|
||||||
|
|
||||||
|
const size_t s_total = (
|
||||||
|
size_t_size_size +
|
||||||
|
s_cell_count_size +
|
||||||
|
s_layer_count_size +
|
||||||
|
n_embd_v_gqa_size +
|
||||||
|
s_cell_data_size
|
||||||
|
);
|
||||||
|
|
||||||
|
return s_total;
|
||||||
|
}
|
||||||
|
|
||||||
|
static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) {
|
||||||
|
const auto & kv_self = ctx->kv_self;
|
||||||
|
GGML_ASSERT(!kv_self.recurrent); // not implemented
|
||||||
|
|
||||||
|
// Save the size of size_t as a uint32_t for safety check
|
||||||
|
const uint32_t size_t_size = sizeof(size_t);
|
||||||
|
data_ctx.write(&size_t_size, sizeof(size_t_size));
|
||||||
|
|
||||||
|
std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
|
||||||
|
uint32_t cell_count = 0;
|
||||||
|
|
||||||
|
// Count the number of cells with the specified seq_id
|
||||||
|
// Find all the ranges of cells with this seq id
|
||||||
|
{
|
||||||
|
uint32_t cell_range_begin = kv_self.size;
|
||||||
|
for (uint32_t i = 0; i < kv_self.size; ++i) {
|
||||||
|
const auto & cell = kv_self.cells[i];
|
||||||
|
if (cell.has_seq_id(seq_id)) {
|
||||||
|
++cell_count;
|
||||||
|
if (cell_range_begin == kv_self.size) {
|
||||||
|
cell_range_begin = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
if (cell_range_begin != kv_self.size) {
|
||||||
|
cell_ranges.push_back({ cell_range_begin, i });
|
||||||
|
cell_range_begin = kv_self.size;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (cell_range_begin != kv_self.size) {
|
||||||
|
cell_ranges.push_back({ cell_range_begin, kv_self.size });
|
||||||
|
}
|
||||||
|
|
||||||
|
// DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
|
||||||
|
uint32_t cell_count_check = 0;
|
||||||
|
for (const auto & range : cell_ranges) {
|
||||||
|
cell_count_check += range.second - range.first;
|
||||||
|
}
|
||||||
|
GGML_ASSERT(cell_count == cell_count_check);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write the cell count
|
||||||
|
data_ctx.write(&cell_count, sizeof(cell_count));
|
||||||
|
|
||||||
|
const auto & hparams = ctx->model.hparams;
|
||||||
|
const uint32_t n_layer = hparams.n_layer;
|
||||||
|
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
|
||||||
|
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
|
||||||
|
|
||||||
|
// Write the layer count
|
||||||
|
data_ctx.write(&n_layer, sizeof(n_layer));
|
||||||
|
|
||||||
|
// Write n_embd_v_gqa
|
||||||
|
data_ctx.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
|
||||||
|
|
||||||
|
// Iterate the ranges and write all the pos (this is the token position in the prompt)
|
||||||
|
for (const auto & range : cell_ranges) {
|
||||||
|
for (uint32_t i = range.first; i < range.second; ++i) {
|
||||||
|
const auto & cell = kv_self.cells[i];
|
||||||
|
data_ctx.write(&cell.pos, sizeof(cell.pos));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Iterate and write all the keys first, each row is a cell
|
||||||
|
// Get whole range at a time
|
||||||
|
std::vector<uint8_t> tmp_buf;
|
||||||
|
for (int il = 0; il < (int)n_layer; ++il) {
|
||||||
|
// Write key type
|
||||||
|
const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
|
||||||
|
data_ctx.write(&k_type_i, sizeof(k_type_i));
|
||||||
|
|
||||||
|
// Write row size of key
|
||||||
|
const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
|
||||||
|
data_ctx.write(&k_size_row, sizeof(k_size_row));
|
||||||
|
|
||||||
|
// Read each range of cells of k_size length each into tmp_buf and write out
|
||||||
|
for (const auto & range : cell_ranges) {
|
||||||
|
const size_t range_size = range.second - range.first;
|
||||||
|
tmp_buf.resize(range_size * k_size_row);
|
||||||
|
ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row);
|
||||||
|
data_ctx.write(tmp_buf.data(), tmp_buf.size());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// For the values, they are transposed, so we also need the element size and get the element ranges from each row
|
||||||
|
const uint32_t kv_size = kv_self.size;
|
||||||
|
for (int il = 0; il < (int)n_layer; ++il) {
|
||||||
|
// Write value type
|
||||||
|
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
|
||||||
|
data_ctx.write(&v_type_i, sizeof(v_type_i));
|
||||||
|
|
||||||
|
// Write element size
|
||||||
|
const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
|
||||||
|
data_ctx.write(&v_size_el, sizeof(v_size_el));
|
||||||
|
|
||||||
|
// For each row, we get the element values of each cell
|
||||||
|
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
||||||
|
// Read each range of cells of v_size_el length each into tmp_buf and write out
|
||||||
|
for (const auto & range : cell_ranges) {
|
||||||
|
const size_t range_size = range.second - range.first;
|
||||||
|
const size_t src_offset = (range.first + j * kv_size) * v_size_el;
|
||||||
|
tmp_buf.resize(range_size * v_size_el);
|
||||||
|
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
|
||||||
|
data_ctx.write(tmp_buf.data(), tmp_buf.size());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return data_ctx.get_size_written();
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_seq_id seq_id) {
|
||||||
|
llama_data_buffer_context data_ctx(dst);
|
||||||
|
return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) {
|
||||||
|
auto & kv_self = ctx->kv_self;
|
||||||
|
GGML_ASSERT(!kv_self.recurrent); // not implemented
|
||||||
|
|
||||||
|
// Wipe the slot
|
||||||
|
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
|
||||||
|
|
||||||
|
const uint8_t * inp = src;
|
||||||
|
|
||||||
|
// Read size of size_t
|
||||||
|
uint32_t size_t_size;
|
||||||
|
memcpy(&size_t_size, inp, sizeof(size_t_size));
|
||||||
|
inp += sizeof(size_t_size);
|
||||||
|
if (size_t_size != sizeof(size_t)) {
|
||||||
|
LLAMA_LOG_ERROR("%s: size_t size mismatch\n", __func__);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read the cell count
|
||||||
|
uint32_t cell_count;
|
||||||
|
memcpy(&cell_count, inp, sizeof(cell_count));
|
||||||
|
inp += sizeof(cell_count);
|
||||||
|
|
||||||
|
// Read the layer count
|
||||||
|
uint32_t n_layer_ref;
|
||||||
|
memcpy(&n_layer_ref, inp, sizeof(n_layer_ref));
|
||||||
|
inp += sizeof(n_layer_ref);
|
||||||
|
|
||||||
|
// Read n_embd_v_gqa
|
||||||
|
uint32_t n_embd_v_gqa_ref;
|
||||||
|
memcpy(&n_embd_v_gqa_ref, inp, sizeof(n_embd_v_gqa_ref));
|
||||||
|
inp += sizeof(n_embd_v_gqa_ref);
|
||||||
|
|
||||||
|
// Sanity check model compatibility
|
||||||
|
const auto & hparams = ctx->model.hparams;
|
||||||
|
const uint32_t n_layer = hparams.n_layer;
|
||||||
|
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s();
|
||||||
|
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s();
|
||||||
|
if (n_layer != n_layer_ref) {
|
||||||
|
LLAMA_LOG_ERROR("%s: mismatched n_layer (%d != %d)\n", __func__, n_layer, n_layer_ref);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
if (n_embd_v_gqa != n_embd_v_gqa_ref) {
|
||||||
|
LLAMA_LOG_ERROR("%s: mismatched n_embd_v_gqa (%d != %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Allocate the new cells for the slot
|
||||||
|
if (cell_count) {
|
||||||
|
llama_batch batch = llama_batch_init(cell_count, 0, 1);
|
||||||
|
batch.n_tokens = cell_count;
|
||||||
|
for (uint32_t i = 0; i < cell_count; ++i) {
|
||||||
|
llama_pos pos;
|
||||||
|
memcpy(&pos, inp, sizeof(pos));
|
||||||
|
inp += sizeof(pos);
|
||||||
|
|
||||||
|
batch.pos[i] = pos;
|
||||||
|
batch.n_seq_id[i] = 1;
|
||||||
|
batch.seq_id[i][0] = dest_seq_id;
|
||||||
|
}
|
||||||
|
if (!llama_kv_cache_find_slot(kv_self, batch)) {
|
||||||
|
llama_batch_free(batch);
|
||||||
|
LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
|
||||||
|
// Assume that this is one contiguous block of cells
|
||||||
|
GGML_ASSERT(kv_self.head + cell_count <= kv_self.size);
|
||||||
|
GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]);
|
||||||
|
GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]);
|
||||||
|
GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id));
|
||||||
|
GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id));
|
||||||
|
|
||||||
|
// Cleanup
|
||||||
|
llama_batch_free(batch);
|
||||||
|
}
|
||||||
|
|
||||||
|
const uint32_t kv_size = kv_self.size;
|
||||||
|
const uint32_t kv_head = kv_self.head;
|
||||||
|
|
||||||
|
// For each layer, read the keys for each cell, one row is one cell, read as one contiguous blo
|
||||||
|
for (int il = 0; il < (int)n_layer; ++il) {
|
||||||
|
// Read type of key
|
||||||
|
int32_t k_type_i_ref;
|
||||||
|
memcpy(&k_type_i_ref, inp, sizeof(k_type_i_ref));
|
||||||
|
inp += sizeof(k_type_i_ref);
|
||||||
|
const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
|
||||||
|
if (k_type_i != k_type_i_ref) {
|
||||||
|
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
|
||||||
|
LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read row size of key
|
||||||
|
size_t k_size_row_ref;
|
||||||
|
memcpy(&k_size_row_ref, inp, sizeof(k_size_row_ref));
|
||||||
|
inp += sizeof(k_size_row_ref);
|
||||||
|
const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
|
||||||
|
if (k_size_row != k_size_row_ref) {
|
||||||
|
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
|
||||||
|
LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, k_size_row_ref, il);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cell_count) {
|
||||||
|
// Read and set the keys for the whole cell range
|
||||||
|
ggml_backend_tensor_set(kv_self.k_l[il], inp, kv_head * k_size_row, cell_count * k_size_row);
|
||||||
|
inp += cell_count * k_size_row;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// For each layer, read the values for each cell (transposed)
|
||||||
|
for (int il = 0; il < (int)n_layer; ++il) {
|
||||||
|
// Read type of value
|
||||||
|
int32_t v_type_i_ref;
|
||||||
|
memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref));
|
||||||
|
inp += sizeof(v_type_i_ref);
|
||||||
|
const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
|
||||||
|
if (v_type_i != v_type_i_ref) {
|
||||||
|
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
|
||||||
|
LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read element size of value
|
||||||
|
size_t v_size_el_ref;
|
||||||
|
memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref));
|
||||||
|
inp += sizeof(v_size_el_ref);
|
||||||
|
const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
|
||||||
|
if (v_size_el != v_size_el_ref) {
|
||||||
|
llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
|
||||||
|
LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cell_count) {
|
||||||
|
// For each row in the transposed matrix, read the values for the whole cell range
|
||||||
|
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
||||||
|
const size_t dst_offset = (kv_head + j * kv_size) * v_size_el;
|
||||||
|
ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el);
|
||||||
|
inp += cell_count * v_size_el;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const size_t nread = inp - src;
|
||||||
|
return nread;
|
||||||
|
}
|
||||||
|
|
||||||
|
static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
    llama_file file(filepath, "wb");

    file.write_u32(LLAMA_STATE_SEQ_MAGIC);
    file.write_u32(LLAMA_STATE_SEQ_VERSION);

    // save the prompt
    file.write_u32((uint32_t)n_token_count);
    file.write_raw(tokens, sizeof(llama_token) * n_token_count);

    // save the context state using stream saving
    llama_data_file_context data_ctx(&file);
    llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);

    const size_t res = file.tell();
    GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + data_ctx.get_size_written());
    return res;
}

static size_t llama_state_seq_load_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
    llama_file file(filepath, "rb");

    // version checks
    {
        const uint32_t magic   = file.read_u32();
        const uint32_t version = file.read_u32();

        if (magic != LLAMA_STATE_SEQ_MAGIC || version != LLAMA_STATE_SEQ_VERSION) {
            LLAMA_LOG_ERROR("%s: unknown (magic, version) for sequence state file: %08x, %08x\n", __func__, magic, version);
            return 0;
        }
    }

    // load the prompt
    {
        const uint32_t n_token_count = file.read_u32();

        if (n_token_count > n_token_capacity) {
            LLAMA_LOG_ERROR("%s: token count in sequence state file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
            return 0;
        }

        file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
        *n_token_count_out = n_token_count;
    }

    // restore the context state
    {
        const size_t state_size = file.size - file.tell();
        std::vector<uint8_t> state_data(state_size);
        file.read_raw(state_data.data(), state_size);
        const size_t nread = llama_state_seq_set_data(ctx, state_data.data(), dest_seq_id);
        if (!nread) {
            LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
            return 0;
        }
        GGML_ASSERT(nread <= state_size);
        GGML_ASSERT(nread + sizeof(uint32_t) * 3 + sizeof(llama_token) * *n_token_count_out == file.tell());
    }

    return file.tell();
}

size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
    try {
        return llama_state_seq_save_file_internal(ctx, filepath, seq_id, tokens, n_token_count);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("error saving sequence state file: %s\n", err.what());
        return 0;
    }
}

size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
    try {
        return llama_state_seq_load_file_internal(ctx, filepath, dest_seq_id, tokens_out, n_token_capacity, n_token_count_out);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("error loading sequence state file: %s\n", err.what());
        return 0;
    }
}
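For orientation, a minimal usage sketch of the two new public entry points above: persist sequence 0 of one context and restore it into sequence 1 of another. The names ctx, other_ctx and prompt_tokens are assumptions for illustration, and error handling is abbreviated.

#include "llama.h"
#include <cstdio>
#include <vector>

// assume: ctx and other_ctx were created from the same model, and prompt_tokens
// holds the tokens already decoded into sequence 0 of ctx
void save_and_restore_seq(llama_context * ctx, llama_context * other_ctx,
                          const std::vector<llama_token> & prompt_tokens) {
    // write sequence 0 (magic, version, prompt tokens, KV data) to disk
    const size_t written = llama_state_seq_save_file(ctx, "seq0.bin", 0,
                                                     prompt_tokens.data(), prompt_tokens.size());
    if (written == 0) {
        fprintf(stderr, "failed to save sequence state\n");
        return;
    }

    // read it back into sequence 1 of the other context
    std::vector<llama_token> tokens(prompt_tokens.size());
    size_t n_loaded = 0;
    const size_t read = llama_state_seq_load_file(other_ctx, "seq0.bin", 1,
                                                  tokens.data(), tokens.size(), &n_loaded);
    if (read == 0) {
        fprintf(stderr, "failed to load sequence state\n");
    }
}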
void printcache(struct llama_context * ctx)
{
    struct llama_kv_cache & cache = ctx->kv_self;
@@ -15826,23 +16303,31 @@ float * llama_get_logits(struct llama_context * ctx) {
 }
 
 float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
+    int32_t j = -1;
     llama_synchronize(ctx);
 
     try {
         if (ctx->logits == nullptr) {
             throw std::runtime_error("no logits");
         }
-        if ((size_t) i >= ctx->output_ids.size()) {
+
+        if (i < 0) {
+            j = ctx->n_outputs + i;
+            if (j < 0) {
+                throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
+            }
+        } else if ((size_t) i >= ctx->output_ids.size()) {
             throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
+        } else {
+            j = ctx->output_ids[i];
         }
-        const int32_t j = ctx->output_ids[i];
 
         if (j < 0) {
             throw std::runtime_error(format("batch.logits[%d] != true", i));
         }
-        if ((size_t) j >= ctx->output_size) {
+        if (j >= ctx->n_outputs) {
             // This should not happen
-            throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size));
+            throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
         }
 
         return ctx->logits + j*ctx->model.hparams.n_vocab;
@@ -15862,23 +16347,32 @@ float * llama_get_embeddings(struct llama_context * ctx) {
 }
 
 float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
+    int32_t j = -1;
+
     llama_synchronize(ctx);
 
     try {
         if (ctx->embd == nullptr) {
             throw std::runtime_error("no embeddings");
         }
-        if ((size_t) i >= ctx->output_ids.size()) {
+
+        if (i < 0) {
+            j = ctx->n_outputs + i;
+            if (j < 0) {
+                throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
+            }
+        } else if ((size_t) i >= ctx->output_ids.size()) {
             throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
+        } else {
+            j = ctx->output_ids[i];
         }
-        const int32_t j = ctx->output_ids[i];
 
         if (j < 0) {
             throw std::runtime_error(format("batch.logits[%d] != true", i));
         }
-        if ((size_t) j >= ctx->output_size) {
+        if (j >= ctx->n_outputs) {
             // This should not happen
-            throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size));
+            throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
         }
 
         return ctx->embd + j*ctx->model.hparams.n_embd;
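With the change above, callers no longer need to track the absolute output index of the last token. A hedged fragment, assuming ctx has just completed a llama_decode() call with logits requested for the final token:

// -1 now addresses the last computed logits row; NULL is returned for invalid ids
float * last_logits = llama_get_logits_ith(ctx, -1);
if (last_logits == nullptr) {
    fprintf(stderr, "no logits available for index -1\n");
}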
79
llama.h
@@ -37,10 +37,14 @@
 
 #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
+#define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'
 
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 5
 
+#define LLAMA_STATE_SEQ_MAGIC   LLAMA_FILE_MAGIC_GGSQ
+#define LLAMA_STATE_SEQ_VERSION 1
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -523,6 +527,7 @@ extern "C" {
             struct llama_context * ctx);
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
     // seq_id < 0 : match any sequence
     // p0 < 0     : [0, p1]
     // p1 < 0     : [p0, inf)
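The new comment documents the boolean return value. A small illustrative fragment, assuming ctx and n_keep exist, that falls back to clearing the whole cache when a partial removal is rejected:

// try to drop positions [n_keep, inf) from sequence 0
if (!llama_kv_cache_seq_rm(ctx, 0, n_keep, -1)) {
    // removing a whole sequence never fails, so clear everything instead
    llama_kv_cache_clear(ctx);
}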
@@ -596,34 +601,92 @@ extern "C" {
 
     // Returns the maximum size in bytes of the state (rng, logits, embedding
     // and kv_cache) - will often be smaller after compacting tokens
-    LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
+    LLAMA_API size_t llama_state_get_size(const struct llama_context * ctx);
+    LLAMA_API DEPRECATED(size_t llama_get_state_size(const struct llama_context * ctx),
+        "use llama_state_get_size instead");
 
     // Copies the state to the specified destination address.
     // Destination needs to have allocated enough memory.
     // Returns the number of bytes copied
-    LLAMA_API size_t llama_copy_state_data(
+    LLAMA_API size_t llama_state_get_data(
             struct llama_context * ctx,
                          uint8_t * dst);
+    LLAMA_API DEPRECATED(size_t llama_copy_state_data(
+            struct llama_context * ctx,
+                         uint8_t * dst),
+        "use llama_state_get_data instead");
 
     // Set the state reading from the specified address
     // Returns the number of bytes read
-    LLAMA_API size_t llama_set_state_data(
+    LLAMA_API size_t llama_state_set_data(
             struct llama_context * ctx,
                    const uint8_t * src);
+    LLAMA_API DEPRECATED(size_t llama_set_state_data(
+            struct llama_context * ctx,
+                   const uint8_t * src),
+        "use llama_state_set_data instead");
 
     // Save/load session file
-    LLAMA_API bool llama_load_session_file(
+    LLAMA_API bool llama_state_load_file(
             struct llama_context * ctx,
                       const char * path_session,
                      llama_token * tokens_out,
                           size_t   n_token_capacity,
                           size_t * n_token_count_out);
+    LLAMA_API DEPRECATED(bool llama_load_session_file(
+            struct llama_context * ctx,
+                      const char * path_session,
+                     llama_token * tokens_out,
+                          size_t   n_token_capacity,
+                          size_t * n_token_count_out),
+        "use llama_state_load_file instead");
 
-    LLAMA_API bool llama_save_session_file(
+    LLAMA_API bool llama_state_save_file(
             struct llama_context * ctx,
                       const char * path_session,
                const llama_token * tokens,
                           size_t   n_token_count);
+    LLAMA_API DEPRECATED(bool llama_save_session_file(
+            struct llama_context * ctx,
+                      const char * path_session,
+               const llama_token * tokens,
+                          size_t   n_token_count),
+        "use llama_state_save_file instead");
+
+    // Get the exact size needed to copy the KV cache of a single sequence
+    LLAMA_API size_t llama_state_seq_get_size(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id);
+
+    // Copy the KV cache of a single sequence into the specified buffer
+    LLAMA_API size_t llama_state_seq_get_data(
+            struct llama_context * ctx,
+                         uint8_t * dst,
+                    llama_seq_id   seq_id);
+
+    // Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence
+    // Returns:
+    //  - Positive: Ok
+    //  - Zero: Failed to load
+    LLAMA_API size_t llama_state_seq_set_data(
+            struct llama_context * ctx,
+                   const uint8_t * src,
+                    llama_seq_id   dest_seq_id);
+
+    LLAMA_API size_t llama_state_seq_save_file(
+            struct llama_context * ctx,
+                      const char * filepath,
+                    llama_seq_id   seq_id,
+               const llama_token * tokens,
+                          size_t   n_token_count);
+
+    LLAMA_API size_t llama_state_seq_load_file(
+            struct llama_context * ctx,
+                      const char * filepath,
+                    llama_seq_id   dest_seq_id,
+                     llama_token * tokens_out,
+                          size_t   n_token_capacity,
+                          size_t * n_token_count_out);
 
     //
     // Decoding
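A short sketch of the renamed whole-state buffer API declared above (the old llama_copy_state_data / llama_set_state_data names remain as deprecated wrappers); ctx is an assumed context handle and the buffer handling is illustrative only:

// snapshot the full context state (rng, logits, embeddings, KV cache) ...
const size_t state_size = llama_state_get_size(ctx);
std::vector<uint8_t> state(state_size);
const size_t n_copied = llama_state_get_data(ctx, state.data());

// ... decode more tokens ...

// ... then roll the context back to the snapshot
const size_t n_read = llama_state_set_data(ctx, state.data());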
@@ -686,8 +749,9 @@ extern "C" {
     // Rows: number of tokens for which llama_batch.logits[i] != 0
     // Cols: n_vocab
     LLAMA_API float * llama_get_logits(struct llama_context * ctx);
 
-    // Logits for the ith token. Equivalent to:
+    // Logits for the ith token. For positive indices, Equivalent to:
     // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
+    // Negative indicies can be used to access logits in reverse order, -1 is the last logit.
     // returns NULL for invalid ids.
     LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
@@ -699,8 +763,9 @@ extern "C" {
     // Otherwise, returns NULL.
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
-    // Get the embeddings for the ith token. Equivalent to:
+    // Get the embeddings for the ith token. For positive indices, Equivalent to:
     // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
+    // Negative indicies can be used to access embeddings in reverse order, -1 is the last embedding.
     // shape: [n_embd] (1-dimensional)
     // returns NULL for invalid ids.
     LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
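And the embeddings counterpart, again using a negative index for the last output. This assumes ctx was created with embeddings enabled and model is the loaded llama_model:

const int32_t n_embd = llama_n_embd(model);
float * last_embd = llama_get_embeddings_ith(ctx, -1);   // NULL for invalid ids
if (last_embd != nullptr) {
    // first component of the last embedding vector
    printf("embd[0] = %f (dim %d)\n", last_embd[0], n_embd);
}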
9
scripts/gen-authors.sh
Executable file
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+printf "# date: $(date)\n" > AUTHORS
+printf "# this file is auto-generated by scripts/gen-authors.sh\n\n" >> AUTHORS
+
+git log --format='%an <%ae>' --reverse --date=short master | awk '!seen[$0]++' | sort >> AUTHORS
+
+# if necessary, update your name here. for example: jdoe -> John Doe
+sed -i '' 's/^jdoe/John Doe/g' AUTHORS