mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 09:34:37 +00:00
integrated libopenblas for greatly accelerated prompt processing. Windows binaries are included - feel free to build your own or to build for other platforms, but that is beyond the scope of this repo. Will fall back to non-blas if libopenblas is removed.
This commit is contained in:
parent
49c4c225b5
commit
664b277c27
12 changed files with 624 additions and 15 deletions
|
@ -31,14 +31,23 @@ class generation_outputs(ctypes.Structure):
|
|||
_fields_ = [("status", ctypes.c_int),
|
||||
("text", ctypes.c_char * 16384)]
|
||||
|
||||
dir_path = os.path.dirname(os.path.realpath(__file__))
|
||||
handle = ctypes.CDLL(os.path.join(dir_path, "llamacpp.dll"))
|
||||
handle = None
|
||||
use_blas = False # if true, uses OpenBLAS for acceleration. libopenblas.dll must exist in the same dir.
|
||||
|
||||
handle.load_model.argtypes = [load_model_inputs]
|
||||
handle.load_model.restype = ctypes.c_bool
|
||||
handle.generate.argtypes = [generation_inputs, ctypes.c_wchar_p] #apparently needed for osx to work. i duno why they need to interpret it that way but whatever
|
||||
handle.generate.restype = generation_outputs
|
||||
|
||||
def init_library():
|
||||
global handle, use_blas
|
||||
dir_path = os.path.dirname(os.path.realpath(__file__))
|
||||
if use_blas:
|
||||
#OpenBLAS should provide about a 2x speedup on prompt ingestion if compatible.
|
||||
handle = ctypes.CDLL(os.path.join(dir_path, "llamacpp_blas.dll"))
|
||||
else:
|
||||
handle = ctypes.CDLL(os.path.join(dir_path, "llamacpp.dll"))
|
||||
|
||||
handle.load_model.argtypes = [load_model_inputs]
|
||||
handle.load_model.restype = ctypes.c_bool
|
||||
handle.generate.argtypes = [generation_inputs, ctypes.c_wchar_p] #apparently needed for osx to work. i duno why they need to interpret it that way but whatever
|
||||
handle.generate.restype = generation_outputs
|
||||
|
||||
def load_model(model_filename,batch_size=8,max_context_length=512,n_parts_overwrite=-1,threads=6):
|
||||
inputs = load_model_inputs()
|
||||
inputs.model_filename = model_filename.encode("UTF-8")
|
||||
|
@ -276,6 +285,14 @@ def RunServerMultiThreaded(addr, port, embedded_kailite = None):
|
|||
sys.exit(0)
|
||||
|
||||
def main(args):
|
||||
global use_blas
|
||||
if not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "libopenblas.dll")):
|
||||
print("Warning: libopenblas.dll not found. OpenBLAS will be disabled.")
|
||||
use_blas = False
|
||||
elif not args.noblas:
|
||||
print("Attempting to use OpenBLAS library for faster prompt ingestion. A compatible libopenblas.dll will be required.")
|
||||
use_blas = True
|
||||
init_library() # Note: if blas does not exist and is enabled, program will crash.
|
||||
ggml_selected_file = args.model_file
|
||||
embedded_kailite = None
|
||||
if not ggml_selected_file:
|
||||
|
@ -331,5 +348,6 @@ if __name__ == '__main__':
|
|||
default_threads = (os.cpu_count() if os.cpu_count()<=6 else max(6,os.cpu_count()-2))
|
||||
parser.add_argument("--threads", help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=default_threads)
|
||||
parser.add_argument("--nostream", help="Disables pseudo streaming", action='store_true')
|
||||
parser.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue