Fixed stopper tokens, fixed BLAS mode for GPT2 and GPT-J, updated Kobold Lite

This commit is contained in:
Concedo 2023-04-16 21:54:18 +08:00
parent 6548d3b3fb
commit c757fbee1d
6 changed files with 17 additions and 14 deletions

View file

@ -453,7 +453,7 @@ if __name__ == '__main__':
default_threads = (physical_core_limit if physical_core_limit<=3 else max(3,physical_core_limit-1))
parser.add_argument("--threads", help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=default_threads)
parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutils to determine thread count based on physical cores.", action='store_true')
parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512)", type=int,choices=[128,256,512,1024], default=512)
parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512)", type=int,choices=[64,128,256,512,1024], default=512)
parser.add_argument("--stream", help="Uses pseudo streaming", action='store_true')
parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true')
parser.add_argument("--nommap", help="If set, do not use mmap to load newer models", action='store_true')