tensor split active text

Concedo 2024-02-09 12:02:23 +08:00
parent 22a4d84050
commit fe424a5466
2 changed files with 3 additions and 1 deletion


@@ -875,6 +875,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 }
 if(!ts_all_zero)
 {
+    printf("\nApplying Tensor Split...");
     llama_ctx_params.tensor_split = inputs.tensor_split;
 }
 #endif
@@ -976,6 +977,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 }
 if(!ts_all_zero)
 {
+    printf("\nApplying Tensor Split...");
     model_params.tensor_split = inputs.tensor_split;
 }
 #endif
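
Both hunks add the same status line behind the existing ts_all_zero guard, so "Applying Tensor Split..." is printed only when the user actually supplied at least one nonzero split ratio. A minimal Python sketch of that guard logic (an illustration of the check, not the KoboldCpp C++ source):

    def tensor_split_all_zero(split):
        # The split stays inactive while every requested ratio is zero,
        # i.e. the user never passed --tensor_split.
        return all(r == 0.0 for r in split)

    if not tensor_split_all_zero([7.0, 3.0]):
        print("\nApplying Tensor Split...")  # mirrors the printf added above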


@@ -2644,7 +2644,7 @@ if __name__ == '__main__':
 compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs. For hipBLAS binaries, please check YellowRoseCx rocm fork.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq]'), choices=['normal', 'lowvram', '0', '1', '2', '3', 'mmq'])
 compatgroup.add_argument("--usevulkan", help="Use Vulkan for GPU Acceleration. Can optionally specify GPU Device ID (e.g. --usevulkan 0).", metavar=('[Device ID]'), nargs='*', type=int, default=None)
 parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU.",metavar=('[GPU layers]'), nargs='?', const=1, type=int, default=0)
-parser.add_argument("--tensor_split", help="For CUDA with ALL GPU set only, ratio to split tensors across multiple GPUs, space-separated list of proportions, e.g. 7 3", metavar=('[Ratios]'), type=float, nargs='+')
+parser.add_argument("--tensor_split", help="For CUDA and Vulkan only, ratio to split tensors across multiple GPUs, space-separated list of proportions, e.g. 7 3", metavar=('[Ratios]'), type=float, nargs='+')
 parser.add_argument("--onready", help="An optional shell command to execute after the model has been loaded.", metavar=('[shell command]'), type=str, default="",nargs=1)
 parser.add_argument("--benchmark", help="Do not start server, instead run benchmarks. If filename is provided, appends results to provided file.", metavar=('[filename]'), nargs='?', const="stdout", type=str, default=None)
 parser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them.", metavar=('limit'), nargs='?', const=1, type=int, default=0)