Mirror of https://github.com/LostRuins/koboldcpp.git, synced 2025-09-11 01:24:36 +00:00
limit default threads to max 8 to deal with ecores
commit 09adfa70ad
parent ff5743100e
2 changed files with 2 additions and 1 deletion
README.md
@@ -58,7 +58,7 @@ when you can't use the precompiled binary directly, we provide an automated buil
 - For Arch Linux: Install `cblas` `openblas` and `clblast`.
 - For Debian: Install `libclblast-dev` and `libopenblas-dev`.
 - You can attempt a CuBLAS build with `LLAMA_CUBLAS=1`. You will need CUDA Toolkit installed. Some have also reported success with the CMake file, though that is more for windows.
-- For a full featured build (all backends), do `make LLAMA_OPENBLAS=1 LLAMA_CLBLAST=1 LLAMA_CUBLAS=1 LLAMA_VULKAN=1`
+- For a full featured build (all backends), do `make LLAMA_OPENBLAS=1 LLAMA_CLBLAST=1 LLAMA_CUBLAS=1 LLAMA_VULKAN=1`. (Note that `LLAMA_CUBLAS=1` will not work on windows, you need visual studio)
 - After all binaries are built, you can run the python script with the command `koboldcpp.py [ggml_model.bin] [port]`

 - Note: Many OSX users have found that the using Accelerate is actually faster than OpenBLAS. To try, you may wish to run with `--noblas` and compare speeds.
koboldcpp.py
@@ -3465,6 +3465,7 @@ if __name__ == '__main__':
     if os.cpu_count()!=None and os.cpu_count()>1:
         physical_core_limit = int(os.cpu_count()/2)
     default_threads = (physical_core_limit if physical_core_limit<=3 else max(3,physical_core_limit-1))
+    default_threads = (8 if default_threads > 8 else default_threads) #there is zero reason to exceed 8 threads by default. this helps avoid e-cores.
     parser.add_argument("--threads", metavar=('[threads]'), help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=default_threads)
     compatgroup = parser.add_mutually_exclusive_group()
     compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs. For hipBLAS binaries, please check YellowRoseCx rocm fork.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq] [rowsplit]'), choices=['normal', 'lowvram', '0', '1', '2', '3', 'mmq', 'rowsplit'])
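To see what the new cap does end to end, here is a minimal sketch of the default-thread heuristic after this commit, written as a standalone function. The function name, the `logical_cores` parameter, and the fallbacks are illustrative assumptions, not koboldcpp API; it also assumes `os.cpu_count()` reports logical cores, which counts SMT siblings and e-cores alike.

```python
import os

def pick_default_threads(logical_cores=None):
    # Illustrative restatement of the heuristic in the hunk above;
    # the function name and fallbacks are assumptions, not koboldcpp API.
    if logical_cores is None:
        logical_cores = os.cpu_count() or 1
    physical_core_limit = 1
    if logical_cores > 1:
        # Estimate physical cores as half the logical count (assumes 2-way SMT).
        physical_core_limit = logical_cores // 2
    default_threads = (physical_core_limit if physical_core_limit <= 3
                       else max(3, physical_core_limit - 1))
    # The line this commit adds: never default past 8 threads, so the
    # default avoids spilling onto slower e-cores on hybrid CPUs.
    default_threads = 8 if default_threads > 8 else default_threads
    return default_threads

# A 24-logical-core hybrid CPU: 24 -> 12 "physical" -> 11 -> capped to 8.
print(pick_default_threads(24))  # 8
print(pick_default_threads(6))   # 3 (small CPUs are unaffected by the cap)
```

For example, on an 8P+8E hybrid part with SMT on the P-cores (24 logical cores), the old heuristic would default to 11 threads and spill work onto e-cores; the cap keeps the default at 8, matching the P-core count. Users can still pass `--threads` explicitly to override the default.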