multiple minor fixes

This commit is contained in:
Concedo 2024-05-17 15:47:53 +08:00
parent 51aa32b928
commit 1db3421c52
3 changed files with 64 additions and 53 deletions

View file

@ -2296,6 +2296,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
printf("]\n"); printf("]\n");
} }
bool earlystopped = false;
if(!inputs.bypass_eos_token && inputs.allow_eos_token && (id==eosID || (id==eotID && id!=-1))) if(!inputs.bypass_eos_token && inputs.allow_eos_token && (id==eosID || (id==eotID && id!=-1)))
{ {
stopper_unused_tokens = remaining_tokens; stopper_unused_tokens = remaining_tokens;
@ -2305,39 +2306,49 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
} }
remaining_tokens = 0; remaining_tokens = 0;
last_stop_reason = stop_reason::EOS_TOKEN_HIT; last_stop_reason = stop_reason::EOS_TOKEN_HIT;
earlystopped = true;
} }
for (const auto &matched : special_stop_sequence) if(!earlystopped)
{ {
if(id==matched) for (const auto &matched : special_stop_sequence)
{ {
stopper_unused_tokens = remaining_tokens; if(id==matched)
if(allow_regular_prints)
{ {
printf("\n(Special Stop Token Triggered! ID:%d)",matched); stopper_unused_tokens = remaining_tokens;
if(allow_regular_prints)
{
printf("\n(Special Stop Token Triggered! ID:%d)",matched);
}
remaining_tokens = 0;
last_stop_reason = stop_reason::EOS_TOKEN_HIT;
earlystopped = true;
break;
} }
remaining_tokens = 0;
last_stop_reason = stop_reason::EOS_TOKEN_HIT;
break;
} }
} }
for (const auto &matched : stop_sequence) if(!earlystopped)
{ {
if (concat_output.find(matched) != std::string::npos) for (const auto &matched : stop_sequence)
{ {
stopper_unused_tokens = remaining_tokens; if (concat_output.find(matched) != std::string::npos)
remaining_tokens = 0;
if(allow_regular_prints)
{ {
auto match_clean = matched; stopper_unused_tokens = remaining_tokens;
replace_all(match_clean, "\n", "\\n"); remaining_tokens = 0;
printf("\n(Stop sequence triggered: %s)", match_clean.c_str()); if(allow_regular_prints)
{
auto match_clean = matched;
replace_all(match_clean, "\n", "\\n");
printf("\n(Stop sequence triggered: %s)", match_clean.c_str());
}
last_stop_reason = stop_reason::CUSTOM_STOPPER;
earlystopped = true;
break;
} }
last_stop_reason = stop_reason::CUSTOM_STOPPER;
break;
} }
} }
fflush(stdout); fflush(stdout);
} }
else else

File diff suppressed because one or more lines are too long

View file

@ -1813,7 +1813,7 @@ def show_new_gui():
# decided to follow yellowrose's and kalomaze's suggestions, this function will automatically try to determine GPU identifiers # decided to follow yellowrose's and kalomaze's suggestions, this function will automatically try to determine GPU identifiers
# run in new thread so it doesnt block. does not return anything, instead overwrites specific values and redraws GUI # run in new thread so it doesnt block. does not return anything, instead overwrites specific values and redraws GUI
def auto_gpu_heuristics(): def auto_gpu_heuristics():
from subprocess import run, CalledProcessError import subprocess
FetchedCUdevices = [] FetchedCUdevices = []
FetchedCUdeviceMem = [] FetchedCUdeviceMem = []
AMDgpu = None AMDgpu = None
@ -1822,10 +1822,10 @@ def show_new_gui():
output = "" output = ""
data = None data = None
try: try:
output = run(["clinfo","--json"], capture_output=True, text=True, check=True, encoding='utf-8').stdout output = subprocess.run(["clinfo","--json"], capture_output=True, text=True, check=True, encoding='utf-8').stdout
data = json.loads(output) data = json.loads(output)
except Exception as e1: except Exception as e1:
output = run([((os.path.join(basepath, "winclinfo.exe")) if os.name == 'nt' else "clinfo"),"--json"], capture_output=True, text=True, check=True, encoding='utf-8').stdout output = subprocess.run([((os.path.join(basepath, "winclinfo.exe")) if os.name == 'nt' else "clinfo"),"--json"], capture_output=True, text=True, check=True, creationflags=subprocess.CREATE_NO_WINDOW | subprocess.DETACHED_PROCESS, encoding='utf-8').stdout
data = json.loads(output) data = json.loads(output)
plat = 0 plat = 0
dev = 0 dev = 0
@ -1846,7 +1846,7 @@ def show_new_gui():
pass pass
try: # Get NVIDIA GPU names try: # Get NVIDIA GPU names
output = run(['nvidia-smi','--query-gpu=name,memory.total','--format=csv,noheader'], capture_output=True, text=True, check=True, encoding='utf-8').stdout output = subprocess.run(['nvidia-smi','--query-gpu=name,memory.total','--format=csv,noheader'], capture_output=True, text=True, check=True, encoding='utf-8').stdout
FetchedCUdevices = [line.split(",")[0].strip() for line in output.splitlines()] FetchedCUdevices = [line.split(",")[0].strip() for line in output.splitlines()]
FetchedCUdeviceMem = [line.split(",")[1].strip().split(" ")[0].strip() for line in output.splitlines()] FetchedCUdeviceMem = [line.split(",")[1].strip().split(" ")[0].strip() for line in output.splitlines()]
except Exception as e: except Exception as e:
@ -1854,7 +1854,7 @@ def show_new_gui():
if len(FetchedCUdevices)==0: if len(FetchedCUdevices)==0:
try: # Get AMD ROCm GPU names try: # Get AMD ROCm GPU names
output = run(['rocminfo'], capture_output=True, text=True, check=True, encoding='utf-8').stdout output = subprocess.run(['rocminfo'], capture_output=True, text=True, check=True, encoding='utf-8').stdout
device_name = None device_name = None
for line in output.splitlines(): # read through the output line by line for line in output.splitlines(): # read through the output line by line
line = line.strip() line = line.strip()
@ -1864,13 +1864,13 @@ def show_new_gui():
AMDgpu = True AMDgpu = True
elif line.startswith("Device Type:") and "GPU" not in line: device_name = None elif line.startswith("Device Type:") and "GPU" not in line: device_name = None
if FetchedCUdevices: if FetchedCUdevices:
getamdvram = run(['rocm-smi', '--showmeminfo', 'vram', '--csv'], capture_output=True, text=True, check=True, encoding='utf-8').stdout # fetch VRAM of devices getamdvram = subprocess.run(['rocm-smi', '--showmeminfo', 'vram', '--csv'], capture_output=True, text=True, check=True, encoding='utf-8').stdout # fetch VRAM of devices
FetchedCUdeviceMem = [line.split(",")[1].strip() for line in getamdvram.splitlines()[1:] if line.strip()] FetchedCUdeviceMem = [line.split(",")[1].strip() for line in getamdvram.splitlines()[1:] if line.strip()]
except Exception as e: except Exception as e:
pass pass
try: # Get Vulkan names try: # Get Vulkan names
output = run(['vulkaninfo','--summary'], capture_output=True, text=True, check=True, encoding='utf-8').stdout output = subprocess.run(['vulkaninfo','--summary'], capture_output=True, text=True, check=True, encoding='utf-8').stdout
devicelist = [line.split("=")[1].strip() for line in output.splitlines() if "deviceName" in line] devicelist = [line.split("=")[1].strip() for line in output.splitlines() if "deviceName" in line]
idx = 0 idx = 0
for dname in devicelist: for dname in devicelist:
@ -2061,7 +2061,7 @@ def show_new_gui():
quick_gpuname_label = ctk.CTkLabel(quick_tab, text="") quick_gpuname_label = ctk.CTkLabel(quick_tab, text="")
quick_gpuname_label.grid(row=3, column=1, padx=75, sticky="W") quick_gpuname_label.grid(row=3, column=1, padx=75, sticky="W")
quick_gpuname_label.configure(text_color="#ffff00") quick_gpuname_label.configure(text_color="#ffff00")
quick_gpu_layers_entry,quick_gpu_layers_label = makelabelentry(quick_tab,"GPU Layers:", gpulayers_var, 6, 50,"How many layers to offload onto the GPU.\nVRAM intensive, usage increases with model and context size.\nRequires some trial and error to find the best fit value.") quick_gpu_layers_entry,quick_gpu_layers_label = makelabelentry(quick_tab,"GPU Layers:", gpulayers_var, 6, 50,"How many layers to offload onto the GPU.\nVRAM intensive, usage increases with model and context size.\nRequires some trial and error to find the best fit value.\n\nCommon values for total layers, accuracy not guaranteed.\n\nLlama/Mistral 7b/8b: 33\nSolar 10.7b/11b: 49\nLlama 13b: 41\nLlama 20b(stack): 63\nLlama/Yi 34b: 61\nMixtral 8x7b: 33\nLlama 70b: 81")
quick_mmq_box = makecheckbox(quick_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1,tooltiptxt="Enable MMQ mode instead of CuBLAS for prompt processing. Read the wiki. Speed may vary.") quick_mmq_box = makecheckbox(quick_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1,tooltiptxt="Enable MMQ mode instead of CuBLAS for prompt processing. Read the wiki. Speed may vary.")
@ -2098,7 +2098,7 @@ def show_new_gui():
gpuname_label = ctk.CTkLabel(hardware_tab, text="") gpuname_label = ctk.CTkLabel(hardware_tab, text="")
gpuname_label.grid(row=3, column=1, padx=75, sticky="W") gpuname_label.grid(row=3, column=1, padx=75, sticky="W")
gpuname_label.configure(text_color="#ffff00") gpuname_label.configure(text_color="#ffff00")
gpu_layers_entry,gpu_layers_label = makelabelentry(hardware_tab,"GPU Layers:", gpulayers_var, 6, 50,"How many layers to offload onto the GPU.\nVRAM intensive, usage increases with model and context size.\nRequires some trial and error to find the best fit value.") gpu_layers_entry,gpu_layers_label = makelabelentry(hardware_tab,"GPU Layers:", gpulayers_var, 6, 50,"How many layers to offload onto the GPU.\nVRAM intensive, usage increases with model and context size.\nRequires some trial and error to find the best fit value.\n\nCommon values for total layers, accuracy not guaranteed.\n\nLlama/Mistral 7b/8b: 33\nSolar 10.7b/11b: 49\nLlama 13b: 41\nLlama 20b(stack): 63\nLlama/Yi 34b: 61\nMixtral 8x7b: 33\nLlama 70b: 81")
tensor_split_entry,tensor_split_label = makelabelentry(hardware_tab, "Tensor Split:", tensor_split_str_vars, 8, 80, tooltip='When using multiple GPUs this option controls how large tensors should be split across all GPUs.\nUses a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order.\nFor example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1.') tensor_split_entry,tensor_split_label = makelabelentry(hardware_tab, "Tensor Split:", tensor_split_str_vars, 8, 80, tooltip='When using multiple GPUs this option controls how large tensors should be split across all GPUs.\nUses a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order.\nFor example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1.')
lowvram_box = makecheckbox(hardware_tab, "Low VRAM (No KV offload)", lowvram_var, 4,0, tooltiptxt='Avoid offloading KV Cache or scratch buffers to VRAM.\nAllows more layers to fit, but may result in a speed loss.') lowvram_box = makecheckbox(hardware_tab, "Low VRAM (No KV offload)", lowvram_var, 4,0, tooltiptxt='Avoid offloading KV Cache or scratch buffers to VRAM.\nAllows more layers to fit, but may result in a speed loss.')
mmq_box = makecheckbox(hardware_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1, tooltiptxt="Enable MMQ mode to use finetuned kernels instead of default CuBLAS/HipBLAS for prompt processing.\nRead the wiki. Speed may vary.") mmq_box = makecheckbox(hardware_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1, tooltiptxt="Enable MMQ mode to use finetuned kernels instead of default CuBLAS/HipBLAS for prompt processing.\nRead the wiki. Speed may vary.")