diff --git a/.github/workflows/kcpp-build-release-osx.yaml b/.github/workflows/kcpp-build-release-osx.yaml
index ce4be1276..29b2676aa 100644
--- a/.github/workflows/kcpp-build-release-osx.yaml
+++ b/.github/workflows/kcpp-build-release-osx.yaml
@@ -29,8 +29,8 @@ jobs:
       - name: Test
         id: test
         run: |
-          wget https://huggingface.co/concedo/KobbleTinyV2-1.1B-GGUF/resolve/main/KobbleTiny-Q4_K.gguf
-          dist/koboldcpp-mac-arm64 --model KobbleTiny-Q4_K.gguf --gpulayers 99 --benchmark dist/bench.csv
+          wget https://huggingface.co/concedo/koboldcpp/resolve/main/baby_llama.gguf
+          dist/koboldcpp-mac-arm64 --model baby_llama.gguf --gpulayers 99 --benchmark --prompt 'Hi, my name is'

       - name: Save artifact
         uses: actions/upload-artifact@v3
diff --git a/koboldcpp.py b/koboldcpp.py
index e96b51056..0563664c1 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -875,15 +875,17 @@ def generate(prompt, memory="", images=[], max_length=32, max_context_length=512
             inputs.images[n] = "".encode("UTF-8")
         else:
             inputs.images[n] = images[n].encode("UTF-8")
-    if max_length >= (max_context_length-1):
-        max_length = max_context_length-1
-        print("\nWarning: You are trying to generate with max_length near or exceeding max_context_length. Most of the context will be removed, and your outputs will not be very coherent.")
     global showmaxctxwarning
     if max_context_length > maxctx:
         if showmaxctxwarning:
             print(f"\n(Warning! Request max_context_length={max_context_length} exceeds allocated context size of {maxctx}. It will be reduced to fit. Consider launching with increased --contextsize to avoid errors. This message will only show once per session.)")
             showmaxctxwarning = False
         max_context_length = maxctx
+    min_remain = min(max_context_length-4, 16)
+    if max_length >= (max_context_length-min_remain):
+        max_length = max_context_length-min_remain
+        print("\nWarning: You are trying to generate with max_length near or exceeding max_context_length. Most of the context will be removed, and your outputs will not be very coherent.")
+
     inputs.max_context_length = max_context_length # this will resize the context buffer if changed
     inputs.max_length = max_length
     inputs.temperature = temperature
@@ -1471,7 +1473,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
         current_token = 0
         incomplete_token_buffer = bytearray()
         async_sleep_short = 0.02
-        await asyncio.sleep(0.25) #anti race condition, prevent check from overtaking generate
+        await asyncio.sleep(0.3) #anti race condition, prevent check from overtaking generate
         try:
             tokenReserve = "" #keeps fully formed tokens that we cannot send out yet
             while True:
@@ -2042,6 +2044,7 @@ Enter Prompt:
                     return
                 finally:
+                    time.sleep(0.05)
                     modelbusy.release()


        self.send_response(404)
@@ -4209,10 +4212,20 @@ def main(launch_args,start_server=True):
         save_to_file = (args.benchmark and args.benchmark!="stdout" and args.benchmark!="")
         benchmaxctx = maxctx
         benchlen = 100
+        benchtemp = 0.1
+        benchtopk = 1
+        benchreppen = 1
+        benchbaneos = True
         benchmodel = sanitize_string(os.path.splitext(os.path.basename(modelname))[0])
         benchprompt = ""
         if args.prompt:
             benchprompt = args.prompt
+            benchtopk = 100
+            benchreppen = 1.07
+            benchtemp = 0.8
+            if not args.benchmark:
+                benchbaneos = False
+                benchlen = 256
         if args.benchmark:
             if os.path.exists(args.benchmark) and os.path.getsize(args.benchmark) > 1000000:
                 print(f"\nWarning: The benchmark CSV output file you selected exceeds 1MB. This is probably not what you want, did you select the wrong CSV file?\nFor safety, benchmark output will not be saved.")
@@ -4225,7 +4238,7 @@
                 benchprompt = "1111111111111111"
                 for i in range(0,14): #generate massive prompt
                     benchprompt += benchprompt
-        genout = generate(benchprompt,memory="",images=[],max_length=benchlen,max_context_length=benchmaxctx,temperature=0.1,top_k=1,rep_pen=1,ban_eos_token=True)
+        genout = generate(benchprompt,memory="",images=[],max_length=benchlen,max_context_length=benchmaxctx,temperature=benchtemp,top_k=benchtopk,rep_pen=benchreppen,ban_eos_token=benchbaneos)
         result = genout['text']
         if args.prompt and not args.benchmark:
             restore_stdout()
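
Note (not part of the patch): two minimal standalone sketches of the behavior the hunks above introduce. The helper names clamp_max_length and pick_bench_params are hypothetical, invented here for illustration; only the arithmetic mirrors the diff.

The reworked clamp in generate() now reserves up to 16 tokens of context for the prompt instead of a single token, falling back to all-but-4 when the context itself is tiny:

    def clamp_max_length(max_length: int, max_context_length: int) -> int:
        # Reserve up to 16 tokens of context for the prompt; for very small
        # contexts (under 20 tokens) leave all but 4 instead.
        min_remain = min(max_context_length - 4, 16)
        if max_length >= (max_context_length - min_remain):
            max_length = max_context_length - min_remain
        return max_length

    assert clamp_max_length(512, 512) == 496   # old code would have allowed 511
    assert clamp_max_length(100, 512) == 100   # unchanged when well below the cap

The main() changes keep the old near-greedy, EOS-banned settings for plain --benchmark runs, switch to creative sampling when --prompt is supplied, and additionally allow EOS with a longer reply when --prompt is given without --benchmark:

    def pick_bench_params(has_prompt: bool, has_benchmark: bool):
        # Defaults mirror the previously hard-coded generate() call:
        # near-greedy sampling with EOS banned so the full benchlen is produced.
        benchlen, benchtemp, benchtopk, benchreppen, benchbaneos = 100, 0.1, 1, 1, True
        if has_prompt:
            # A user-supplied prompt gets normal creative sampling instead.
            benchtemp, benchtopk, benchreppen = 0.8, 100, 1.07
            if not has_benchmark:
                # Pure prompt mode: respect EOS and allow a longer reply.
                benchbaneos, benchlen = False, 256
        return benchlen, benchtemp, benchtopk, benchreppen, benchbaneos

    assert pick_bench_params(True, False) == (256, 0.8, 100, 1.07, False)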