mirror of https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 09:34:37 +00:00

Merge commit '1c641e6aac' into concedo_experimental

# Conflicts:
#	.devops/cloud-v-pipeline
#	.devops/llama-cli-cuda.Dockerfile
#	.devops/llama-cli-rocm.Dockerfile
#	.devops/llama-cli-vulkan.Dockerfile
#	.devops/llama-cli.Dockerfile
#	.devops/llama-cpp-clblast.srpm.spec
#	.devops/llama-cpp-cuda.srpm.spec
#	.devops/llama-cpp.srpm.spec
#	.devops/llama-server-cuda.Dockerfile
#	.devops/llama-server-rocm.Dockerfile
#	.devops/llama-server-vulkan.Dockerfile
#	.devops/llama-server.Dockerfile
#	.devops/nix/apps.nix
#	.devops/nix/package.nix
#	.devops/tools.sh
#	.dockerignore
#	.github/ISSUE_TEMPLATE/01-bug-low.yml
#	.github/ISSUE_TEMPLATE/02-bug-medium.yml
#	.github/ISSUE_TEMPLATE/03-bug-high.yml
#	.github/ISSUE_TEMPLATE/04-bug-critical.yml
#	.github/workflows/bench.yml
#	.github/workflows/build.yml
#	.github/workflows/docker.yml
#	.github/workflows/server.yml
#	.gitignore
#	Makefile
#	README-sycl.md
#	README.md
#	ci/run.sh
#	docs/token_generation_performance_tips.md
#	flake.nix
#	grammars/README.md
#	pocs/vdot/CMakeLists.txt
#	scripts/get-hellaswag.sh
#	scripts/get-wikitext-103.sh
#	scripts/get-wikitext-2.sh
#	scripts/get-winogrande.sh
#	scripts/hf.sh
#	scripts/pod-llama.sh
#	scripts/qnt-all.sh
#	scripts/run-all-ppl.sh
#	scripts/run-with-preset.py
#	scripts/server-llm.sh
#	tests/test-backend-ops.cpp

This commit is contained in:
commit b53e760557

94 changed files with 457 additions and 317 deletions
.devops/llama-cli-intel.Dockerfile (new normal file, 26 lines)

@@ -0,0 +1,26 @@
+ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
+
+ARG LLAMA_SYCL_F16=OFF
+RUN apt-get update && \
+    apt-get install -y git
+
+WORKDIR /app
+
+COPY . .
+
+RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
+    echo "LLAMA_SYCL_F16 is set" && \
+    export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
+    fi && \
+    cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
+    cmake --build build --config Release --target llama-cli
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
+
+COPY --from=build /app/build/bin/llama-cli /llama-cli
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/llama-cli" ]
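A quick local smoke test of the new image can look like the following sketch; the image tag and model path are placeholders, not anything defined by this commit:

```bash
# build the SYCL CLI image from the repository root; the tag is arbitrary
docker build -t llamacpp-intel-cli -f .devops/llama-cli-intel.Dockerfile .

# run it against a GGUF model mounted from the host
# (add --device /dev/dri if you want the container to reach an Intel GPU)
docker run --rm -v /path/to/models:/models llamacpp-intel-cli -m /models/model.gguf -p "Hello"
```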
.devops/llama-server-intel.Dockerfile (new normal file, 29 lines)

@@ -0,0 +1,29 @@
+ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
+
+ARG LLAMA_SYCL_F16=OFF
+RUN apt-get update && \
+    apt-get install -y git libcurl4-openssl-dev
+
+WORKDIR /app
+
+COPY . .
+
+RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
+    echo "LLAMA_SYCL_F16 is set" && \
+    export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
+    fi && \
+    cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
+    cmake --build build --config Release --target llama-server
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
+
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev
+
+COPY --from=build /app/build/bin/llama-server /llama-server
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/llama-server" ]
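A similar sketch for the server image, again with an assumed tag, port and model path (pass --host/--port explicitly rather than relying on defaults if unsure):

```bash
# build the SYCL server image; the tag is arbitrary
docker build -t llamacpp-intel-server -f .devops/llama-server-intel.Dockerfile .

# expose the HTTP port and point the server at a mounted model
docker run --rm -p 8080:8080 -v /path/to/models:/models llamacpp-intel-server \
    -m /models/model.gguf --host 0.0.0.0 --port 8080
```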
@@ -100,7 +100,7 @@ Have a look at existing implementation like `build_llama`, `build_dbrx` or `buil
 
 When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR.
 
-Note: to debug the inference graph: you can use [eval-callback](../examples/eval-callback).
+Note: to debug the inference graph: you can use [llama-eval-callback](../examples/eval-callback).
 
 ## GGUF specification
 
@@ -13,42 +13,43 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 if (EMSCRIPTEN)
 else()
     add_subdirectory(baby-llama)
-    add_subdirectory(batched)
     add_subdirectory(batched-bench)
+    add_subdirectory(batched)
     add_subdirectory(benchmark)
     add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(embedding)
     add_subdirectory(eval-callback)
+    add_subdirectory(export-lora)
     add_subdirectory(finetune)
-    add_subdirectory(gritlm)
+    add_subdirectory(gbnf-validator)
     add_subdirectory(gguf-split)
+    add_subdirectory(gguf)
+    add_subdirectory(gritlm)
+    add_subdirectory(imatrix)
     add_subdirectory(infill)
     add_subdirectory(llama-bench)
     add_subdirectory(llava)
-    if (LLAMA_SYCL)
-        add_subdirectory(sycl)
-    endif()
-    add_subdirectory(main)
-    add_subdirectory(tokenize)
-    add_subdirectory(parallel)
-    add_subdirectory(perplexity)
-    add_subdirectory(quantize)
-    add_subdirectory(quantize-stats)
-    add_subdirectory(retrieval)
-    add_subdirectory(save-load-state)
-    add_subdirectory(simple)
-    add_subdirectory(passkey)
-    add_subdirectory(speculative)
     add_subdirectory(lookahead)
     add_subdirectory(lookup)
-    add_subdirectory(gguf)
-    add_subdirectory(train-text-from-scratch)
-    add_subdirectory(imatrix)
-    if (LLAMA_BUILD_SERVER)
-        add_subdirectory(server)
-    endif()
-    add_subdirectory(export-lora)
+    add_subdirectory(main)
+    add_subdirectory(parallel)
+    add_subdirectory(passkey)
+    add_subdirectory(perplexity)
+    add_subdirectory(quantize-stats)
+    add_subdirectory(quantize)
+    add_subdirectory(retrieval)
     if (LLAMA_RPC)
         add_subdirectory(rpc)
     endif()
+    if (LLAMA_BUILD_SERVER)
+        add_subdirectory(server)
+    endif()
+    if (LLAMA_SYCL)
+        add_subdirectory(sycl)
+    endif()
+    add_subdirectory(save-load-state)
+    add_subdirectory(simple)
+    add_subdirectory(speculative)
+    add_subdirectory(tokenize)
+    add_subdirectory(train-text-from-scratch)
 endif()
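The net effect of this reshuffle, together with the target renames elsewhere in the commit, is that every example now builds as a `llama-*` binary under `build/bin`. A minimal sketch of the new build-and-run flow, with placeholder model paths:

```bash
cmake -B build
cmake --build build --config Release -j

# what used to be ./main, ./quantize and ./server is now:
./build/bin/llama-cli      -m models/7B/ggml-model.gguf -p "Once upon a time"
./build/bin/llama-quantize models/7B/ggml-model-f16.gguf models/7B/ggml-model-q4_k_m.gguf q4_k_m
./build/bin/llama-server   -m models/7B/ggml-model.gguf
```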
@@ -22,7 +22,7 @@ if [ -n "$N_THREAD" ]; then
     GEN_OPTIONS+=(--threads "$N_THREAD")
 fi
 
-./main "${GEN_OPTIONS[@]}" \
+./llama-cli "${GEN_OPTIONS[@]}" \
     --model "$MODEL" \
     --in-prefix " " \
     --in-suffix "${AI_NAME}:" \

@@ -1,4 +1,4 @@
-set(TARGET baby-llama)
+set(TARGET llama-baby-llama)
 add_executable(${TARGET} baby-llama.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

@@ -58,4 +58,4 @@ echo "$2
 model=$1
 
 # generate the most likely continuation until the string "===" is found
-./main -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
+./llama-cli -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs

@@ -1,4 +1,4 @@
-set(TARGET batched-bench)
+set(TARGET llama-batched-bench)
 add_executable(${TARGET} batched-bench.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
@@ -10,16 +10,16 @@ There are 2 modes of operation:
 - `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
 
 ```bash
-./batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
+./llama-batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
 
 # LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
-./batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
+./llama-batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
 
 # LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
-./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
+./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
 
 # custom set of batches
-./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
+./llama-batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
 ```
 
 ## Sample results
@@ -1,6 +1,6 @@
 .PHONY: build
 
 build:
-	xcodebuild -scheme batched_swift -destination "generic/platform=macOS" -derivedDataPath build
-	rm -f ./batched_swift
-	ln -s ./build/Build/Products/Debug/batched_swift ./batched_swift
+	xcodebuild -scheme llama-batched-swift -destination "generic/platform=macOS" -derivedDataPath build
+	rm -f ./llama-batched-swift
+	ln -s ./build/Build/Products/Debug/llama-batched-swift ./llama-batched-swift

@@ -4,7 +4,7 @@
 import PackageDescription
 
 let package = Package(
-    name: "batched_swift",
+    name: "llama-batched-swift",
     platforms: [.macOS(.v12)],
     dependencies: [
         .package(name: "llama", path: "../../"),

@@ -13,7 +13,7 @@ let package = Package(
     // Targets are the basic building blocks of a package, defining a module or a test suite.
     // Targets can depend on other targets in this package and products from dependencies.
     .executableTarget(
-        name: "batched_swift",
+        name: "llama-batched-swift",
         dependencies: ["llama"],
         path: "Sources",
         linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]
@@ -1,4 +1,4 @@
 This is a swift clone of `examples/batched`.
 
 $ `make`
-$ `./batched_swift MODEL_PATH [PROMPT] [PARALLEL]`
+$ `./llama-batched-swift MODEL_PATH [PROMPT] [PARALLEL]`

@@ -1,4 +1,4 @@
-set(TARGET batched)
+set(TARGET llama-batched)
 add_executable(${TARGET} batched.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

@@ -3,7 +3,7 @@
 The example demonstrates batched generation from a given prompt
 
 ```bash
-./batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
+./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
 
 ...
 

@@ -1,4 +1,4 @@
-set(TARGET benchmark)
+set(TARGET llama-bench-matmult)
 add_executable(${TARGET} benchmark-matmult.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
@@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
     $PROMPT_TEMPLATE > $PROMPT_FILE
 
 # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
-./main $GEN_OPTIONS \
+./llama-cli $GEN_OPTIONS \
     --model "$MODEL" \
     --threads "$N_THREAD" \
     --n_predict "$N_PREDICTS" \

@@ -62,7 +62,7 @@ fi
 if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
     echo 'Prompt cache does not exist, building...'
     # Default batch_size to 64 here for better user feedback during initial prompt processing
-    ./main 2>>"$LOG" \
+    ./llama-cli 2>>"$LOG" \
         --batch_size 64 \
         "${OPTS[@]}" \
         --prompt-cache "$PROMPT_CACHE_FILE" \
@@ -109,13 +109,13 @@ while read -e line; do
 
     printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"
 
-    ./main 2>>"$LOG" "${OPTS[@]}" \
+    ./llama-cli 2>>"$LOG" "${OPTS[@]}" \
         --prompt-cache "$CUR_PROMPT_CACHE" \
         --prompt-cache-all \
         --file "$CUR_PROMPT_FILE" \
         --reverse-prompt "${USER_NAME}:" \
         --n_predict "$n_predict" |
-        skip_bytes 1 |                  # skip BOS token added by ./main
+        skip_bytes 1 |                  # skip BOS token added by ./llama-cli
         tee "$CUR_PROMPT_FILE.tmp" |    # save prompt + generation to tmp file
         skip_bytes "$n_prompt_len_pre"  # print generation
 

@@ -133,7 +133,7 @@ while read -e line; do
     # TODO get both messages in one go
     if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
        ! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
-        echo >&2 "Couldn't get number of tokens from ./main output!"
+        echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
         exit 1
     fi
 

@@ -144,7 +144,7 @@ while read -e line; do
     fi
 
     # Update cache for next prompt in background, ideally during user input
-    ./main >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
+    ./llama-cli >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
         --prompt-cache "$NEXT_PROMPT_CACHE" \
         --file "$NEXT_PROMPT_FILE" \
         --n_predict 1 &
@@ -30,7 +30,7 @@ sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
     $PROMPT_TEMPLATE > $PROMPT_FILE
 
 # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
-./bin/main $GEN_OPTIONS \
+./bin/llama-cli $GEN_OPTIONS \
     --model "$MODEL" \
     --threads "$N_THREAD" \
     --n_predict "$N_PREDICTS" \

@@ -11,6 +11,6 @@ cd ..
 #
 # "--keep 48" is based on the contents of prompts/chat-with-bob.txt
 #
-./main -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
+./llama-cli -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
     --repeat_penalty 1.0 --color -i \
     -r "User:" -f prompts/chat-with-bob.txt
@@ -1,4 +1,4 @@
-set(TARGET convert-llama2c-to-ggml)
+set(TARGET llama-convert-llama2c-to-ggml)
 add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

@@ -8,7 +8,7 @@ To convert the model first download the models from the [llama2.c](https://githu
 
 After successful compilation, following usage options are available:
 ```
-usage: ./convert-llama2c-to-ggml [options]
+usage: ./llama-convert-llama2c-to-ggml [options]
 
 options:
   -h, --help                       show this help message and exit

@@ -19,10 +19,10 @@ options:
 
 An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:
 
-`$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
+`$ ./llama-convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
 
 Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.bin` found in [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K).
 
 Now you can use the model with a command like:
 
-`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
+`$ ./llama-cli -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
@@ -1,4 +1,4 @@
-set(TARGET embedding)
+set(TARGET llama-embedding)
 add_executable(${TARGET} embedding.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

@@ -9,13 +9,13 @@ To get started right away, run the following command, making sure to use the cor
 ### Unix-based systems (Linux, macOS, etc.):
 
 ```bash
-./embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
+./llama-embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
 ```
 
 ### Windows:
 
 ```powershell
-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
+llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
 ```
 
 The above command will output space-separated float values.
@@ -1,9 +1,9 @@
-set(TARGET eval-callback)
+set(TARGET llama-eval-callback)
 add_executable(${TARGET} eval-callback.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 
 set(TEST_TARGET test-eval-callback)
-add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
+add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
 set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)

@@ -6,7 +6,7 @@ It simply prints to the console all operations and tensor data.
 Usage:
 
 ```shell
-eval-callback \
+llama-eval-callback \
     --hf-repo ggml-org/models \
     --hf-file phi-2/ggml-model-q4_0.gguf \
     --model phi-2-q4_0.gguf \
@@ -1,4 +1,4 @@
-set(TARGET export-lora)
+set(TARGET llama-export-lora)
 add_executable(${TARGET} export-lora.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

@@ -3,7 +3,7 @@
 Apply LORA adapters to base model and export the resulting model.
 
 ```
-usage: export-lora [options]
+usage: llama-export-lora [options]
 
 options:
   -h, --help                       show this help message and exit

@@ -17,7 +17,7 @@ options:
 For example:
 
 ```bash
-./bin/export-lora \
+./bin/llama-export-lora \
     -m open-llama-3b-v2-q8_0.gguf \
     -o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
     -l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin
@@ -1,4 +1,4 @@
-set(TARGET finetune)
+set(TARGET llama-finetune)
 add_executable(${TARGET} finetune.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

@@ -7,7 +7,7 @@ Basic usage instructions:
 wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
 
 # finetune LORA adapter
-./bin/finetune \
+./bin/llama-finetune \
     --model-base open-llama-3b-v2-q8_0.gguf \
     --checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \
     --checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \

@@ -18,7 +18,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
     --use-checkpointing
 
 # predict
-./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
+./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
 ```
 
 **Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).

@@ -38,14 +38,14 @@ After 10 more iterations:
 Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter.
 
 llama.cpp compatible LORA adapters will be saved with filename specified by `--lora-out FN`.
-These LORA adapters can then be used by `main` together with the base model, like in the 'predict' example command above.
+These LORA adapters can then be used by `llama-cli` together with the base model, like in the 'predict' example command above.
 
-In `main` you can also load multiple LORA adapters, which will then be mixed together.
+In `llama-cli` you can also load multiple LORA adapters, which will then be mixed together.
 
 For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
 
 ```bash
-./bin/main -m open-llama-3b-v2-q8_0.gguf \
+./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
     --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
     --lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
 ```

@@ -55,7 +55,7 @@ You can change how strong each LORA adapter is applied to the base model by usin
 For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:
 
 ```bash
-./bin/main -m open-llama-3b-v2-q8_0.gguf \
+./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
     --lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
     --lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
     --lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
@@ -2,7 +2,7 @@
 cd `dirname $0`
 cd ../..
 
-EXE="./finetune"
+EXE="./llama-finetune"
 
 if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
 if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi

@@ -1,5 +1,5 @@
-set(TARGET gbnf-validator)
+set(TARGET llama-gbnf-validator)
 add_executable(${TARGET} gbnf-validator.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common grammar-parser llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
@@ -7,6 +7,8 @@
 
 #include <cstdio>
 #include <cstdlib>
+#include <sstream>
+#include <fstream>
 #include <string>
 #include <vector>
 

@@ -69,13 +71,14 @@ int main(int argc, char** argv) {
         return 1;
     }
 
-    fseek(grammar_file, 0, SEEK_END);
-    size_t grammar_size = ftell(grammar_file);
-    fseek(grammar_file, 0, SEEK_SET);
-
-    std::string grammar_str(grammar_size, ' ');
-    fread(&grammar_str[0], 1, grammar_size, grammar_file);
-    fclose(grammar_file);
+    std::string grammar_str;
+    {
+        std::ifstream grammar_file(grammar_filename);
+        GGML_ASSERT(grammar_file.is_open() && "Failed to open grammar file");
+        std::stringstream buffer;
+        buffer << grammar_file.rdbuf();
+        grammar_str = buffer.str();
+    }
 
     // Parse the GBNF grammar
     auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());

@@ -100,20 +103,15 @@ int main(int argc, char** argv) {
             grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
 
     // Read the input file
-    FILE* input_file = fopen(input_filename.c_str(), "r");
-    if (!input_file) {
-        fprintf(stdout, "Failed to open input file: %s\n", input_filename.c_str());
-        return 1;
+    std::string input_str;
+    {
+        std::ifstream input_file(input_filename);
+        GGML_ASSERT(input_file.is_open() && "Failed to open input file");
+        std::stringstream buffer;
+        buffer << input_file.rdbuf();
+        input_str = buffer.str();
     }
 
-    fseek(input_file, 0, SEEK_END);
-    size_t input_size = ftell(input_file);
-    fseek(input_file, 0, SEEK_SET);
-
-    std::string input_str(input_size, ' ');
-    fread(&input_str[0], 1, input_size, input_file);
-    fclose(input_file);
-
     // Validate the input string against the grammar
     size_t error_pos;
     std::string error_msg;
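For reference, the renamed validator takes a grammar file and an input file as positional arguments; the exact paths below are only an assumed example of that calling convention:

```bash
# validate an input file against a GBNF grammar (paths are placeholders)
./build/bin/llama-gbnf-validator grammars/json.gbnf some-input.json
```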
@@ -1,4 +1,4 @@
-set(TARGET gguf-split)
+set(TARGET llama-gguf-split)
 add_executable(${TARGET} gguf-split.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

@@ -18,8 +18,8 @@ fi
 
 set -x
 
-SPLIT=$1/gguf-split
-MAIN=$1/main
+SPLIT=$1/llama-gguf-split
+MAIN=$1/llama-cli
 WORK_PATH=$TMP_DIR/gguf-split
 ROOT_DIR=$(realpath $(dirname $0)/../../)
 

@@ -1,4 +1,4 @@
-set(TARGET gguf)
+set(TARGET llama-gguf)
 add_executable(${TARGET} gguf.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})

@@ -1,4 +1,4 @@
-set(TARGET gritlm)
+set(TARGET llama-gritlm)
 add_executable(${TARGET} gritlm.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
@@ -26,7 +26,7 @@ $ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf --ou
 
 Run the example using the downloaded model:
 ```console
-$ ./gritlm -m models/gritlm-7b_q4_1.gguf
+$ ./llama-gritlm -m models/gritlm-7b_q4_1.gguf
 
 Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
 Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103

@@ -1,4 +1,4 @@
-set(TARGET imatrix)
+set(TARGET llama-imatrix)
 add_executable(${TARGET} imatrix.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

@@ -6,7 +6,7 @@ More information is available here: https://github.com/ggerganov/llama.cpp/pull/
 ## Usage
 
 ```
-./imatrix \
+./llama-imatrix \
     -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \
     [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \
     [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]
@@ -28,8 +28,8 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument
 LLAMA_CUDA=1 make -j
 
 # generate importance matrix (imatrix.dat)
-./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
+./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
 
 # use the imatrix to perform a Q4_K_M quantization
-./quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
+./llama-quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
 ```

@@ -1,4 +1,4 @@
-set(TARGET infill)
+set(TARGET llama-infill)
 add_executable(${TARGET} infill.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

@@ -42,5 +42,5 @@ scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.ggu
 ```
 
 ```bash
-./infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
+./llama-infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
 ```
@@ -21,7 +21,7 @@ counter=1
 echo 'Running'
 while IFS= read -r question
 do
-    exe_cmd="./main -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
+    exe_cmd="./llama-cli -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
     echo $counter
     echo "Current Question: $question"
     eval "$exe_cmd"

@@ -1,5 +1,5 @@
 # Usage:
-#! ./server -m some-model.gguf &
+#! ./llama-server -m some-model.gguf &
 #! pip install pydantic
 #! python json-schema-pydantic-example.py
 

@@ -523,7 +523,7 @@ class SchemaConverter:
 def main(args_in = None):
     parser = argparse.ArgumentParser(
         description='''
-            Generates a grammar (suitable for use in ./main) that produces JSON conforming to a
+            Generates a grammar (suitable for use in ./llama-cli) that produces JSON conforming to a
             given JSON schema. Only a subset of JSON schema features are supported; more may be
             added in the future.
         ''',
@@ -1,4 +1,4 @@
-# llama.cpp/example/llama-bench
+# llama.cpp/examples/llama-bench
 
 Performance testing tool for llama.cpp.
 

@@ -30,8 +30,9 @@ if(TARGET BUILD_INFO)
     add_dependencies(llava BUILD_INFO)
 endif()
 
-set(TARGET llava-cli)
-add_executable(llava-cli llava-cli.cpp)
-install(TARGETS llava-cli RUNTIME)
-target_link_libraries(llava-cli PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(llava PRIVATE cxx_std_11)
+set(TARGET llama-llava-cli)
+add_executable(${TARGET} llava-cli.cpp)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
@@ -9,12 +9,12 @@ The implementation is based on llava, and is compatible with llava and mobileVLM
 Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using **MobileVLM-1.7B** as an example, the different conversion step will be shown.
 
 ## Usage
-Build with cmake or run `make llava-cli` to build it.
+Build with cmake or run `make llama-llava-cli` to build it.
 
-After building, run: `./llava-cli` to see the usage. For example:
+After building, run: `./llama-llava-cli` to see the usage. For example:
 
 ```sh
-./llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
+./llama-llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
     --mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
     --image path/to/an/image.jpg \
     -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"

@@ -62,7 +62,7 @@ python ./examples/convert-legacy-llama.py path/to/MobileVLM-1.7B
 
 5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
 ```sh
-./quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
+./llama-quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
 ```
 
 Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory.

@@ -82,7 +82,7 @@ refer to `android/adb_run.sh`, modify resources' `name` and `path`
 ### case 1
 **input**
 ```sh
-/data/local/tmp/llava-cli \
+/data/local/tmp/llama-llava-cli \
     -m /data/local/tmp/ggml-model-q4_k.gguf \
     --mmproj /data/local/tmp/mmproj-model-f16.gguf \
     -t 4 \

@@ -102,7 +102,7 @@ llama_print_timings: total time = 34731.93 ms
 ### case 2
 **input**
 ```sh
-/data/local/tmp/llava-cli \
+/data/local/tmp/llama-llava-cli \
     -m /data/local/tmp/ggml-model-q4_k.gguf \
     --mmproj /data/local/tmp/mmproj-model-f16.gguf \
     -t 4 \

@@ -126,7 +126,7 @@ llama_print_timings: total time = 34570.79 ms
 #### llava-cli release-b2005
 **input**
 ```sh
-/data/local/tmp/llava-cli \
+/data/local/tmp/llama-llava-cli \
     -m /data/local/tmp/ggml-model-q4_k.gguf \
     --mmproj /data/local/tmp/mmproj-model-f16.gguf \
     -t 4 \
@@ -200,7 +200,7 @@ make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
 ### case 1
 **input**
 ```sh
-./llava-cli \
+./llama-llava-cli \
     -m /data/local/tmp/ggml-model-q4_k.gguf \
     --mmproj /data/local/tmp/mmproj-model-f16.gguf \
     --image /data/local/tmp/demo.jpeg \

@@ -224,7 +224,7 @@ llama_print_timings: total time = 1352.63 ms / 252 tokens
 ### case 2
 **input**
 ```sh
-./llava-cli \
+./llama-llava-cli \
     -m /data/local/tmp/ggml-model-q4_k.gguf \
     --mmproj /data/local/tmp/mmproj-model-f16.gguf \
     -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:" \
@@ -11,12 +11,12 @@ For llava-1.6 a variety of prepared gguf models are available as well [7b-34b](h
 After API is confirmed, more models will be supported / uploaded.
 
 ## Usage
-Build with cmake or run `make llava-cli` to build it.
+Build with cmake or run `make llama-llava-cli` to build it.
 
-After building, run: `./llava-cli` to see the usage. For example:
+After building, run: `./llama-llava-cli` to see the usage. For example:
 
 ```sh
-./llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
+./llama-llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
 ```
 
 **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.

@@ -95,9 +95,9 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projecto
 python ./examples/convert-legacy-llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
 ```
 
-7) And finally we can run the llava-cli using the 1.6 model version:
+7) And finally we can run the llava cli using the 1.6 model version:
 ```console
-./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
+./llama-llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
 ```
 
 **note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096)
@@ -10,7 +10,7 @@ prompt="A chat between a curious user and an artificial intelligence assistant.
 # prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
 
 program_dir="build_64/bin"
-binName="llava-cli"
+binName="llama-llava-cli"
 n_threads=4
 
 

@@ -1,4 +1,4 @@
-set(TARGET lookahead)
+set(TARGET llama-lookahead)
 add_executable(${TARGET} lookahead.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
@@ -1,22 +1,22 @@
-set(TARGET lookup)
+set(TARGET llama-lookup)
 add_executable(${TARGET} lookup.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 
-set(TARGET lookup-create)
+set(TARGET llama-lookup-create)
 add_executable(${TARGET} lookup-create.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 
-set(TARGET lookup-merge)
+set(TARGET llama-lookup-merge)
 add_executable(${TARGET} lookup-merge.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 
-set(TARGET lookup-stats)
+set(TARGET llama-lookup-stats)
 add_executable(${TARGET} lookup-stats.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
@@ -11,14 +11,14 @@
 #include <unordered_map>
 #include <vector>
 
-static void print_usage() {
+static void print_usage(char* argv0) {
     fprintf(stderr, "Merges multiple lookup cache files into a single one.\n");
-    fprintf(stderr, "Usage: lookup-merge [--help] lookup_part_1.bin lookup_part_2.bin ... lookup_merged.bin\n");
+    fprintf(stderr, "Usage: %s [--help] lookup_part_1.bin lookup_part_2.bin ... lookup_merged.bin\n", argv0);
 }
 
 int main(int argc, char ** argv){
     if (argc < 3) {
-        print_usage();
+        print_usage(argv[0]);
         exit(1);
     }
 

@@ -27,7 +27,7 @@ int main(int argc, char ** argv){
     for (int i = 0; i < argc-1; ++i) {
         args[i] = argv[i+1];
         if (args[i] == "-h" || args[i] == "--help") {
-            print_usage();
+            print_usage(argv[0]);
             exit(0);
         }
     }
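The usage string printed above implies an invocation like the following sketch (file names are placeholders):

```bash
# merge several lookup caches into one
./build/bin/llama-lookup-merge lookup_part_1.bin lookup_part_2.bin lookup_merged.bin
```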
@@ -1,12 +1,12 @@
 cmake_minimum_required(VERSION 3.12)
-project("main-cmake-pkg" C CXX)
-set(TARGET main-cmake-pkg)
+project("llama-cli-cmake-pkg" C CXX)
+set(TARGET llama-cli-cmake-pkg)
 
 find_package(Llama 0.0.1 REQUIRED)
 
 # Bake common functionality in with target. Because applications
 # using the relocatable Llama package should be outside of the
-# source tree, main-cmake-pkg pretends the dependencies are built-in.
+# source tree, llama-cli-cmake-pkg pretends the dependencies are built-in.
 set(_common_path "${CMAKE_CURRENT_LIST_DIR}/../../common")
 add_library(common OBJECT)
 file(GLOB _common_files

@@ -15,7 +15,7 @@ file(GLOB _common_files
 )
 target_sources(common PRIVATE ${_common_files})
 
-# If the common project was part of "main-cmake-pkg" the transient
+# If the common project was part of "llama-cli-cmake-pkg" the transient
 # defines would automatically be attached. Because the common func-
 # tionality is separate, but dependent upon the defines, it must be
 # explicitly extracted from the "llama" target.

@@ -1,6 +1,6 @@
 # llama.cpp/example/main-cmake-pkg
 
-This program builds the [main](../main) application using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree.
+This program builds [llama-cli](../main) using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree.
 
 ## Building
 

@@ -22,7 +22,7 @@ cmake --build build --config Release
 cmake --install build --prefix C:/LlamaCPP
 ```
 
-### Build main-cmake-pkg
+### Build llama-cli-cmake-pkg
 
 
 ```cmd
@@ -1,4 +1,4 @@
-set(TARGET main)
+set(TARGET llama-cli)
 add_executable(${TARGET} main.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
@@ -1,4 +1,4 @@
-# llama.cpp/example/main
+# llama.cpp/examples/main

 This example program allows you to use various LLaMA language models in an easy and efficient way. It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts.

@@ -20,13 +20,13 @@ To get started right away, run the following command, making sure to use the cor
 #### Unix-based systems (Linux, macOS, etc.):

 ```bash
-./main -m models/7B/ggml-model.bin --prompt "Once upon a time"
+./llama-cli -m models/7B/ggml-model.bin --prompt "Once upon a time"
 ```

 #### Windows:

 ```powershell
-main.exe -m models\7B\ggml-model.bin --prompt "Once upon a time"
+llama-cli.exe -m models\7B\ggml-model.bin --prompt "Once upon a time"
 ```

 For an interactive experience, try this command:
@@ -34,7 +34,7 @@ For an interactive experience, try this command:
 #### Unix-based systems (Linux, macOS, etc.):

 ```bash
-./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \
+./llama-cli -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \
 'User: Hi
 AI: Hello. I am an AI chatbot. Would you like to talk?
 User: Sure!
@@ -45,7 +45,7 @@ User:'
 #### Windows:

 ```powershell
-main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -e -p "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
+llama-cli.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -e -p "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
 ```

 The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
@@ -53,18 +53,18 @@ The following command generates "infinite" text from a starting prompt (you can
 #### Unix-based systems (Linux, macOS, etc.):

 ```bash
-./main -m models/7B/ggml-model.bin --ignore-eos -n -1
+./llama-cli -m models/7B/ggml-model.bin --ignore-eos -n -1
 ```

 #### Windows:

 ```powershell
-main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1
+llama-cli.exe -m models\7B\ggml-model.bin --ignore-eos -n -1
 ```

 ## Common Options

-In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
+In this section, we cover the most commonly used options for running the `llama-cli` program with the LLaMA models:

 - `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`; inferred from `--model-url` if set).
 - `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
@@ -74,7 +74,7 @@ In this section, we cover the most commonly used options for running the `main`

 ## Input Prompts

-The `main` program provides several ways to interact with the LLaMA models using input prompts:
+The `llama-cli` program provides several ways to interact with the LLaMA models using input prompts:

 - `--prompt PROMPT`: Provide a prompt directly as a command-line option.
 - `--file FNAME`: Provide a file containing a prompt or multiple prompts.
@@ -82,7 +82,7 @@ The `main` program provides several ways to interact with the LLaMA models using

 ## Interaction

-The `main` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive` and `--interactive-first`.
+The `llama-cli` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive` and `--interactive-first`.

 In interactive mode, users can participate in text generation by injecting their input during the process. Users can press `Ctrl+C` at any time to interject and type their input, followed by pressing `Return` to submit it to the LLaMA model. To submit additional lines without finalizing input, users can end the current line with a backslash (`\`) and continue typing.

@@ -107,7 +107,7 @@ To overcome this limitation, you can use the `--in-prefix` flag to add a space o
 The `--in-prefix` flag is used to add a prefix to your input, primarily, this is used to insert a space after the reverse prompt. Here's an example of how to use the `--in-prefix` flag in conjunction with the `--reverse-prompt` flag:

 ```sh
-./main -r "User:" --in-prefix " "
+./llama-cli -r "User:" --in-prefix " "
 ```

 ### In-Suffix
@@ -115,7 +115,7 @@ The `--in-prefix` flag is used to add a prefix to your input, primarily, this is
 The `--in-suffix` flag is used to add a suffix after your input. This is useful for adding an "Assistant:" prompt after the user's input. It's added after the new-line character (`\n`) that's automatically added to the end of the user's input. Here's an example of how to use the `--in-suffix` flag in conjunction with the `--reverse-prompt` flag:

 ```sh
-./main -r "User:" --in-prefix " " --in-suffix "Assistant:"
+./llama-cli -r "User:" --in-prefix " " --in-suffix "Assistant:"
 ```

 ## Context Management
@@ -1,4 +1,4 @@
-set(TARGET parallel)
+set(TARGET llama-parallel)
 add_executable(${TARGET} parallel.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
@@ -1,4 +1,4 @@
-set(TARGET passkey)
+set(TARGET llama-passkey)
 add_executable(${TARGET} passkey.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
@@ -8,5 +8,5 @@ See the following PRs for more info:
 ### Usage

 ```bash
-make -j && ./passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf --junk 250
+make -j && ./llama-passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf --junk 250
 ```
@@ -1,4 +1,4 @@
-set(TARGET perplexity)
+set(TARGET llama-perplexity)
 add_executable(${TARGET} perplexity.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
@@ -477,7 +477,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
 }

 // Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
-// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
+// Run `./llama-perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
 // Output: `perplexity: 13.5106 [114/114]`
 // BOS tokens will be added for each chunk before eval

@@ -1,4 +1,4 @@
-set(TARGET quantize-stats)
+set(TARGET llama-quantize-stats)
 add_executable(${TARGET} quantize-stats.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
@@ -1,4 +1,4 @@
-set(TARGET quantize)
+set(TARGET llama-quantize)
 add_executable(${TARGET} quantize.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
@@ -18,9 +18,9 @@ fi

 set -x

-SPLIT=$1/gguf-split
-QUANTIZE=$1/quantize
-MAIN=$1/main
+SPLIT=$1/llama-gguf-split
+QUANTIZE=$1/llama-quantize
+MAIN=$1/llama-cli
 WORK_PATH=$TMP_DIR/quantize
 ROOT_DIR=$(realpath $(dirname $0)/../../)

@@ -8,7 +8,7 @@ if [ "$1" == "-m" ]; then
   MODEL="-m $2 "
 fi

-./main $MODEL --color \
+./llama-cli $MODEL --color \
     -f ./prompts/reason-act.txt \
     -i --interactive-first \
     --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 -c 2048 \
@@ -1,4 +1,4 @@
-set(TARGET retrieval)
+set(TARGET llama-retrieval)
 add_executable(${TARGET} retrieval.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
@@ -15,7 +15,7 @@ https://github.com/ggerganov/llama.cpp/pull/6193
 `retrieval` example can be tested as follows:

 ```bash
-make -j && ./retrieval --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .
+make -j && ./llama-retrieval --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .
 ```

 This chunks and embeds all given files and starts a loop requesting query inputs:
@@ -70,5 +70,5 @@ cmake --build . --config Release
 Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:

 ```bash
-$ bin/main -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
+$ bin/llama-cli -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
 ```
@@ -1,4 +1,4 @@
-set(TARGET save-load-state)
+set(TARGET llama-save-load-state)
 add_executable(${TARGET} save-load-state.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
@@ -16,7 +16,7 @@ GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 4096 --batch-size 1024}"


 # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
-./server $GEN_OPTIONS \
+./llama-server $GEN_OPTIONS \
     --model "$MODEL" \
     --threads "$N_THREAD" \
     --rope-freq-scale 1.0 \
@@ -1,4 +1,4 @@
-set(TARGET server)
+set(TARGET llama-server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
 option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
@@ -80,26 +80,26 @@ The project is under active development, and we are [looking for feedback and co

 ## Build

-`server` is built alongside everything else from the root of the project
+`llama-server` is built alongside everything else from the root of the project

 - Using `make`:

   ```bash
-  make server
+  make llama-server
   ```

 - Using `CMake`:

   ```bash
   cmake -B build
-  cmake --build build --config Release -t server
+  cmake --build build --config Release -t llama-server
   ```

-  Binary is at `./build/bin/server`
+  Binary is at `./build/bin/llama-server`

 ## Build with SSL

-`server` can also be built with SSL support using OpenSSL 3
+`llama-server` can also be built with SSL support using OpenSSL 3

 - Using `make`:

@@ -107,14 +107,14 @@ The project is under active development, and we are [looking for feedback and co
   # NOTE: For non-system openssl, use the following:
   #   CXXFLAGS="-I /path/to/openssl/include"
   #   LDFLAGS="-L /path/to/openssl/lib"
-  make LLAMA_SERVER_SSL=true server
+  make LLAMA_SERVER_SSL=true llama-server
   ```

 - Using `CMake`:

   ```bash
   cmake -B build -DLLAMA_SERVER_SSL=ON
-  cmake --build build --config Release -t server
+  cmake --build build --config Release -t llama-server
   ```

 ## Quick Start

@@ -124,13 +124,13 @@ To get started right away, run the following command, making sure to use the cor
 ### Unix-based systems (Linux, macOS, etc.)

 ```bash
-./server -m models/7B/ggml-model.gguf -c 2048
+./llama-server -m models/7B/ggml-model.gguf -c 2048
 ```

 ### Windows

 ```powershell
-server.exe -m models\7B\ggml-model.gguf -c 2048
+llama-server.exe -m models\7B\ggml-model.gguf -c 2048
 ```

 The above command will start a server that by default listens on `127.0.0.1:8080`.

@@ -629,11 +629,11 @@ bash chat.sh

 ### OAI-like API

-The HTTP `server` supports an OAI-like API: https://github.com/openai/openai-openapi
+The HTTP `llama-server` supports an OAI-like API: https://github.com/openai/openai-openapi

 ### API errors

-`server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi
+`llama-server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi

 Example of an error:

@@ -99,7 +99,7 @@ The `bench.py` script does several steps:
 It aims to be used in the CI, but you can run it manually:

 ```shell
-LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/server python bench.py \
+LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/llama-server python bench.py \
   --runner-label local \
   --name local \
   --branch `git rev-parse --abbrev-ref HEAD` \
@@ -245,7 +245,7 @@ def start_server(args):

 def start_server_background(args):
     # Start the server
-    server_path = '../../../build/bin/server'
+    server_path = '../../../build/bin/llama-server'
     if 'LLAMA_SERVER_BIN_PATH' in os.environ:
         server_path = os.environ['LLAMA_SERVER_BIN_PATH']
     server_args = [
@@ -44,12 +44,12 @@ http module.

 ### running using examples/server

-bin/server -m path/model.gguf --path ../examples/server/public_simplechat [--port PORT]
+./llama-server -m path/model.gguf --path examples/server/public_simplechat [--port PORT]

 ### running using python3's server module

 first run examples/server
-* bin/server -m path/model.gguf
+* ./llama-server -m path/model.gguf

 next run this web front end in examples/server/public_simplechat
 * cd ../examples/server/public_simplechat
@@ -148,7 +148,7 @@ struct server_slot {
     int32_t n_prompt_tokens = 0;
     int32_t n_prompt_tokens_processed = 0;

-    std::string prompt;
+    json prompt; // can be either a string, array of strings or array of token ids

     // when a task is submitted, we first tokenize the prompt and store it here
     std::vector<llama_token> prompt_tokens;
@@ -823,8 +823,13 @@ struct server_context {
                 continue;
             }

+            // skip the slot if it does not contains prompt
+            if (!slot.prompt.is_string()) {
+                continue;
+            }
+
             // current slot's prompt
-            std::string slot_prompt = slot.prompt;
+            std::string slot_prompt = slot.prompt.get<std::string>();

             // length of the current slot's prompt
             int slot_prompt_len = slot_prompt.size();
@@ -958,12 +963,12 @@ struct server_context {
                 return false;
             }

-            if (prompt->is_string()) {
-                slot.prompt = prompt->get<std::string>();
-            } else if (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_string()) {
-                slot.prompt = prompt->at(0).get<std::string>();
+            if ((prompt->is_string()) ||
+                (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_string()) ||
+                (prompt->is_array() && !prompt->empty() && prompt->at(0).is_number_integer())) {
+                slot.prompt = *prompt;
             } else {
-                send_error(task, "\"prompt\" must be a string or an array of strings", ERROR_TYPE_INVALID_REQUEST);
+                send_error(task, "\"prompt\" must be a string or an array of integers", ERROR_TYPE_INVALID_REQUEST);
                 return false;
             }
         }
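(As an aside, a minimal standalone sketch of the prompt shapes this hunk accepts, using nlohmann::json, which the server already relies on; the helper name and example values are purely illustrative, not part of the patch: a plain string, a one-element array containing a string, or a non-empty array of integer token ids pass, anything else is rejected.)

```cpp
#include <nlohmann/json.hpp>
#include <cassert>

using json = nlohmann::json;

// Hypothetical helper mirroring the validation in the diff above.
static bool prompt_shape_ok(const json & prompt) {
    return prompt.is_string() ||
           (prompt.is_array() && prompt.size() == 1 && prompt.at(0).is_string()) ||
           (prompt.is_array() && !prompt.empty() && prompt.at(0).is_number_integer());
}

int main() {
    assert( prompt_shape_ok(json("Hello")));                  // plain string
    assert( prompt_shape_ok(json::array({"Hello"})));         // single string wrapped in an array
    assert( prompt_shape_ok(json::array({1, 2, 3})));         // pre-tokenized prompt (token ids)
    assert(!prompt_shape_ok(json::array({"a", "b"})));        // multiple strings: rejected here
    assert(!prompt_shape_ok(json::object({{"text", "hi"}}))); // wrong type entirely
    return 0;
}
```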
@@ -27,10 +27,8 @@ To mitigate it, you can increase values in `n_predict`, `kv_size`.

    ```shell
    cd ../../..
-   mkdir build
-   cd build
-   cmake -DLLAMA_CURL=ON ../
-   cmake --build . --target server
+   cmake -B build -DLLAMA_CURL=ON
+   cmake --build build --target llama-server
    ```

 2. Start the test: `./tests.sh`
@@ -40,7 +38,7 @@ It's possible to override some scenario steps values with environment variables:
 | variable                 | description                                                                                      |
 |--------------------------|--------------------------------------------------------------------------------------------------|
 | `PORT`                   | `context.server_port` to set the listening port of the server during scenario, default: `8080`  |
-| `LLAMA_SERVER_BIN_PATH`  | to change the server binary path, default: `../../../build/bin/server`                          |
+| `LLAMA_SERVER_BIN_PATH`  | to change the server binary path, default: `../../../build/bin/llama-server`                    |
 | `DEBUG`                  | "ON" to enable steps and server verbose mode `--verbose`                                         |
 | `SERVER_LOG_FORMAT_JSON` | if set switch server logs to json format                                                         |
 | `N_GPU_LAYERS`           | number of model layers to offload to VRAM `-ngl --n-gpu-layers`                                  |
@@ -1272,9 +1272,9 @@ def context_text(context):

 def start_server_background(context):
     if os.name == 'nt':
-        context.server_path = '../../../build/bin/Release/server.exe'
+        context.server_path = '../../../build/bin/Release/llama-server.exe'
     else:
-        context.server_path = '../../../build/bin/server'
+        context.server_path = '../../../build/bin/llama-server'
     if 'LLAMA_SERVER_BIN_PATH' in os.environ:
         context.server_path = os.environ['LLAMA_SERVER_BIN_PATH']
     server_listen_addr = context.server_fqdn
@@ -1,4 +1,4 @@
-set(TARGET simple)
+set(TARGET llama-simple)
 add_executable(${TARGET} simple.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
@@ -1,4 +1,4 @@
-set(TARGET speculative)
+set(TARGET llama-speculative)
 add_executable(${TARGET} speculative.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
@@ -2,7 +2,7 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: MIT

-set(TARGET ls-sycl-device)
+set(TARGET llama-ls-sycl-device)
 add_executable(${TARGET} ls-sycl-device.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
@@ -6,9 +6,9 @@ This example program provides the tools for llama.cpp for SYCL on Intel GPU.

 |Tool Name| Function|Status|
 |-|-|-|
-|ls-sycl-device| List all SYCL devices with ID, compute capability, max work group size, ect.|Support|
+|llama-ls-sycl-device| List all SYCL devices with ID, compute capability, max work group size, ect.|Support|

-### ls-sycl-device
+### llama-ls-sycl-device

 List all SYCL devices with ID, compute capability, max work group size, ect.

@@ -23,7 +23,7 @@ source /opt/intel/oneapi/setvars.sh
 3. Execute

 ```
-./build/bin/ls-sycl-device
+./build/bin/llama-ls-sycl-device
 ```

 Check the ID in startup log, like:
@@ -23,15 +23,15 @@ fi
 if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then
     echo "use $GGML_SYCL_DEVICE as main GPU"
     #use signle GPU only
-    ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
 else
     #use multiple GPUs with same max compute units
-    ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
 fi

 #use main GPU only
-#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
+#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none

 #use multiple GPUs with same max compute units
-#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0

@@ -1,4 +1,4 @@
-set(TARGET tokenize)
+set(TARGET llama-tokenize)
 add_executable(${TARGET} tokenize.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
@@ -1,4 +1,4 @@
-set(TARGET train-text-from-scratch)
+set(TARGET llama-train-text-from-scratch)
 add_executable(${TARGET} train-text-from-scratch.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
@@ -7,7 +7,7 @@ Basic usage instructions:
 wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt

 # train
-./bin/train-text-from-scratch \
+./bin/llama-train-text-from-scratch \
         --vocab-model ../models/ggml-vocab-llama.gguf \
         --ctx 64 --embd 256 --head 8 --layer 16 \
         --checkpoint-in chk-shakespeare-256x16-LATEST.gguf \
@@ -18,7 +18,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
         --no-checkpointing

 # predict
-./bin/main -m ggml-shakespeare-256x16-f32.gguf
+./bin/llama-cli -m ggml-shakespeare-256x16-f32.gguf
 ```

 Output files will be saved every N iterations (config with `--save-every N`).
@@ -2746,7 +2746,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
             case GGML_UNARY_OP_HARDSWISH:
             case GGML_UNARY_OP_GELU_QUICK:
             case GGML_UNARY_OP_TANH:
-                return true;
+                return ggml_is_contiguous(op->src[0]);
             default:
                 return false;
             }
@@ -149,7 +149,7 @@ static __global__ void flash_attn_vec_ext_f32(
     for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
         const int i = i0 + threadIdx.x;

-        Q_f2[j][i0/WARP_SIZE] = ncols <= 2 || ic0 + j ? Q_f2_j[i] : make_float2(0.0f, 0.0f);
+        Q_f2[j][i0/WARP_SIZE] = ncols <= 2 || ic0 + j < ne01 ? Q_f2_j[i] : make_float2(0.0f, 0.0f);
         Q_f2[j][i0/WARP_SIZE].x *= scale;
         Q_f2[j][i0/WARP_SIZE].y *= scale;
     }
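(For intuition, a small hedged sketch, in plain host-side C++ with illustrative names, of the guard this hunk fixes: when a kernel gathers a fixed block of rows, rows past `ne01` do not exist and must be replaced by zeros; the old `ic0 + j` truthiness test only checked that the index was nonzero.)

```cpp
#include <vector>
#include <cstdint>

// Gather up to `ncols` rows starting at `ic0` from a row-major ne01 x ne00
// matrix, padding missing rows with zeros instead of reading out of bounds.
std::vector<float> gather_rows(const std::vector<float> & q, int64_t ne00, int64_t ne01,
                               int64_t ic0, int ncols) {
    std::vector<float> out(static_cast<size_t>(ncols) * ne00, 0.0f);
    for (int j = 0; j < ncols; ++j) {
        if (ic0 + j < ne01) {                       // the corrected bound check
            for (int64_t i = 0; i < ne00; ++i) {
                out[j * ne00 + i] = q[(ic0 + j) * ne00 + i];
            }
        }
        // with a bare `ic0 + j` test, row 0 would be skipped and rows >= ne01
        // would be read past the end of the buffer
    }
    return out;
}
```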
@@ -148,6 +148,8 @@ void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();

+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);

@@ -160,6 +162,8 @@ void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();

+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);

@@ -172,6 +176,8 @@ void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();

+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);

@@ -184,6 +190,8 @@ void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();

+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);

@@ -196,6 +204,8 @@ void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();

+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);

@@ -208,6 +218,8 @@ void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();

+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);

@@ -220,6 +232,8 @@ void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();

+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);

@@ -232,6 +246,8 @@ void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();

+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);

@@ -244,6 +260,8 @@ void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();

+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);

@@ -259,6 +277,8 @@ void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();

+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);

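(A note on why these asserts appear: the element-wise CUDA ops index their buffers as flat arrays, one element per thread, which is only equivalent to the logical tensor when the data has no gaps between rows or planes. A hedged, host-side C++ sketch of the same access pattern, with illustrative names; the real kernels live in the CUDA backend:)

```cpp
#include <cmath>
#include <cstdint>

// The element-wise pattern the unary launchers rely on: dst[i] = f(src[i]) for
// a flat index i in [0, n). If src had padded (non-contiguous) rows, flat
// indexing would read padding bytes as if they were elements, hence the new
// GGML_ASSERT(ggml_is_contiguous(src0)) guard in each launcher.
static void gelu_flat(const float * src, float * dst, int64_t n) {
    for (int64_t i = 0; i < n; ++i) {
        const float x = src[i];
        dst[i] = 0.5f * x * (1.0f + std::tanh(0.79788456f * (x + 0.044715f * x * x * x)));
    }
}
```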
@@ -1340,7 +1340,7 @@ static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
             case GGML_UNARY_OP_RELU:
             case GGML_UNARY_OP_GELU:
             case GGML_UNARY_OP_SILU:
-                return true;
+                return ggml_is_contiguous(op->src[0]);
             default:
                 ;
         }
@@ -744,7 +744,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
             case GGML_UNARY_OP_GELU:
             case GGML_UNARY_OP_GELU_QUICK:
             case GGML_UNARY_OP_SILU:
-                return true;
+                return ggml_is_contiguous(op->src[0]);
             default:
                 return false;
             }
@@ -17190,7 +17190,7 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
             case GGML_UNARY_OP_HARDSWISH:
             case GGML_UNARY_OP_GELU_QUICK:
             case GGML_UNARY_OP_TANH:
-                return true;
+                return ggml_is_contiguous(op->src[0]);
             default:
                 return false;
             }
@@ -1,5 +1,5 @@
 #include "ggml-vulkan.h"
+#include <vulkan/vulkan_core.h>
 #ifdef GGML_VULKAN_RUN_TESTS
 #include <chrono>
 #endif
@@ -9,12 +9,13 @@
 #include <algorithm>
 #include <cmath>
 #include <iostream>
-#include <limits>
 #include <tuple>
 #include <vector>
 #include <sstream>
 #include <utility>
 #include <memory>
+#include <limits>
+#include <map>

 #include "ggml.h"
 #include "ggml-backend-impl.h"
@@ -1555,8 +1556,10 @@ static void ggml_vk_print_gpu_info(size_t idx) {
     vk::PhysicalDeviceProperties2 props2;
     vk::PhysicalDeviceMaintenance3Properties props3;
     vk::PhysicalDeviceSubgroupProperties subgroup_props;
+    vk::PhysicalDeviceDriverProperties driver_props;
     props2.pNext = &props3;
     props3.pNext = &subgroup_props;
+    subgroup_props.pNext = &driver_props;
     physical_device.getProperties2(&props2);

     const size_t subgroup_size = subgroup_props.subgroupSize;
@@ -1600,7 +1603,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
     fp16 = fp16 && vk12_features.shaderFloat16;

     std::string device_name = props2.properties.deviceName.data();
-    std::cerr << GGML_VK_NAME << idx << ": " << device_name << " | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;
+    std::cerr << GGML_VK_NAME << idx << ": " << device_name << " (" << driver_props.driverName << ") | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl;

     if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
         std::cerr << "ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want." << std::endl;
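(As a side note, the driver name printed above comes from chaining the driver-properties struct into the properties query via pNext. A minimal Vulkan-Hpp sketch of that pattern, assuming a valid `vk::PhysicalDevice` and omitting error handling; it mirrors the calls the patch makes rather than adding anything new:)

```cpp
#include <vulkan/vulkan.hpp>
#include <iostream>

// Query core properties plus driver identity in one call by linking the
// driver-properties struct into the pNext chain.
void print_device_and_driver(vk::PhysicalDevice device) {
    vk::PhysicalDeviceProperties2      props2;
    vk::PhysicalDeviceDriverProperties driver_props;
    props2.pNext = &driver_props;
    device.getProperties2(&props2);

    std::cout << props2.properties.deviceName.data()
              << " (" << driver_props.driverName.data() << ")" << std::endl;
}
```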
@@ -1696,7 +1699,78 @@ void ggml_vk_instance_init() {
         vk::PhysicalDeviceProperties props = devices[i].getProperties();

         if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
+            // Check if there are two physical devices corresponding to the same GPU
+            auto old_device = std::find_if(
+                vk_instance.device_indices.begin(),
+                vk_instance.device_indices.end(),
+                [&devices, &props](const size_t k){ return devices[k].getProperties().deviceID == props.deviceID; }
+            );
+            if (old_device == vk_instance.device_indices.end()) {
                 vk_instance.device_indices.push_back(i);
+            } else {
+                // There can be two physical devices corresponding to the same GPU if there are 2 different drivers
+                // This can cause error when splitting layers aross the devices, need to keep only 1
+#ifdef GGML_VULKAN_DEBUG
+                std::cerr << "Device " << i << " and device " << *old_device << " have the same device id" << std::endl;
+#endif
+
+                vk::PhysicalDeviceProperties2 old_prop;
+                vk::PhysicalDeviceDriverProperties old_driver;
+                old_prop.pNext = &old_driver;
+                devices[*old_device].getProperties2(&old_prop);
+
+                vk::PhysicalDeviceProperties2 new_prop;
+                vk::PhysicalDeviceDriverProperties new_driver;
+                new_prop.pNext = &new_driver;
+                devices[i].getProperties2(&new_prop);
+
+                std::map<vk::DriverId, int> driver_priorities {};
+                int old_priority = std::numeric_limits<int>::max();
+                int new_priority = std::numeric_limits<int>::max();
+
+                // Check https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkDriverId.html for the list of driver id
+                // Smaller number -> higher priority
+                switch (old_prop.properties.vendorID) {
+                    case VK_VENDOR_ID_AMD:
+                        driver_priorities[vk::DriverId::eMesaRadv] = 1;
+                        driver_priorities[vk::DriverId::eAmdOpenSource] = 2;
+                        driver_priorities[vk::DriverId::eAmdProprietary] = 3;
+                        break;
+                    case VK_VENDOR_ID_INTEL:
+                        driver_priorities[vk::DriverId::eIntelOpenSourceMESA] = 1;
+                        driver_priorities[vk::DriverId::eIntelProprietaryWindows] = 2;
+                        break;
+                    case VK_VENDOR_ID_NVIDIA:
+                        driver_priorities[vk::DriverId::eNvidiaProprietary] = 1;
+#if defined(VK_API_VERSION_1_3) && VK_HEADER_VERSION >= 235
+                        driver_priorities[vk::DriverId::eMesaNvk] = 2;
+#endif
+                        break;
+                }
+
+                if (driver_priorities.count(old_driver.driverID)) {
+                    old_priority = driver_priorities[old_driver.driverID];
+                }
+                if (driver_priorities.count(new_driver.driverID)) {
+                    new_priority = driver_priorities[new_driver.driverID];
+                }
+
+                if (new_priority < old_priority) {
+                    auto r = std::remove(vk_instance.device_indices.begin(), vk_instance.device_indices.end(), *old_device);
+                    vk_instance.device_indices.erase(r, vk_instance.device_indices.end());
+                    vk_instance.device_indices.push_back(i);
+
+#ifdef GGML_VULKAN_DEBUG
+                    std::cerr << "Prioritize device " << i << " driver " << new_driver.driverName << " over device " << *old_device << " driver " << old_driver.driverName << std::endl;
+#endif
+                }
+#ifdef GGML_VULKAN_DEBUG
+                else {
+                    std::cerr << "Prioritize device " << *old_device << " driver " << old_driver.driverName << " over device " << i << " driver " << new_driver.driverName << std::endl;
+
+                }
+#endif
+            }
         }
     }

@@ -6365,7 +6439,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
             case GGML_UNARY_OP_GELU:
             case GGML_UNARY_OP_SILU:
             case GGML_UNARY_OP_RELU:
-                return true;
+                return ggml_is_contiguous(op->src[0]);
             default:
                 return false;
             }
156
ggml.c
@@ -3232,35 +3232,42 @@ GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) {
     return tensor->nb[0] > tensor->nb[1];
 }

-GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return
-        tensor->nb[0] == ggml_type_size(tensor->type) &&
-        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
-        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
-        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) {
+    size_t next_nb = ggml_type_size(tensor->type);
+    if (tensor->ne[0] != ggml_blck_size(tensor->type) && tensor->nb[0] != next_nb) {
+        return false;
+    }
+    next_nb *= tensor->ne[0]/ggml_blck_size(tensor->type);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        if (tensor->ne[i] != 1) {
+            if (i > n) {
+                if (tensor->nb[i] != next_nb) {
+                    return false;
+                }
+                next_nb *= tensor->ne[i];
+            } else {
+                // this dimension does not need to be contiguous
+                next_nb = tensor->ne[i]*tensor->nb[i];
+            }
+        }
+    }
+    return true;
+}
+
+GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
+    return ggml_is_contiguous_0(tensor);
 }

 GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
-    return ggml_is_contiguous(tensor);
+    return ggml_is_contiguous_n(tensor, 0);
 }

 GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return
-        tensor->nb[0] == ggml_type_size(tensor->type) &&
-        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
-        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+    return ggml_is_contiguous_n(tensor, 1);
 }

 GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return
-        tensor->nb[0] == ggml_type_size(tensor->type) &&
-        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+    return ggml_is_contiguous_n(tensor, 2);
 }

 GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
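(To make the new semantics concrete, here is a hedged, standalone restatement of the check in C++, with strides in bytes as in ggml's `nb[]` but without block-quantized types, i.e. assuming a block size of 1; the helper name and the example tensors are purely illustrative.)

```cpp
#include <cassert>
#include <cstdint>
#include <cstddef>

// Simplified version of ggml_is_contiguous_n for non-quantized types:
// dimensions above n must be packed tightly, dimensions up to n may carry
// arbitrary padding, and size-1 dimensions are ignored.
static bool is_contiguous_n(const int64_t ne[4], const size_t nb[4], size_t type_size, int n) {
    size_t next_nb = type_size;
    if (nb[0] != next_nb) {
        return false;
    }
    next_nb *= ne[0];
    for (int i = 1; i < 4; ++i) {
        if (ne[i] != 1) {
            if (i > n) {
                if (nb[i] != next_nb) {
                    return false;
                }
                next_nb *= ne[i];
            } else {
                // this dimension is allowed to be padded
                next_nb = ne[i] * nb[i];
            }
        }
    }
    return true;
}

int main() {
    // 8x4 float32 matrix: tightly packed vs. 16 bytes of padding per row.
    const int64_t ne[4]        = {8, 4, 1, 1};
    const size_t  nb_tight[4]  = {4, 32, 128, 128};
    const size_t  nb_padded[4] = {4, 48, 192, 192};

    assert( is_contiguous_n(ne, nb_tight,  4, 0)); // fully contiguous (ggml_is_contiguous)
    assert(!is_contiguous_n(ne, nb_padded, 4, 0)); // padded rows break full contiguity
    assert( is_contiguous_n(ne, nb_padded, 4, 1)); // but each row is still contiguous (contiguous_1)
    return 0;
}
```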
@@ -4103,32 +4110,26 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
     switch (tensor->type) {
         case GGML_TYPE_I8:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
                 return ((int8_t *)(tensor->data))[i];
             }
         case GGML_TYPE_I16:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
                 return ((int16_t *)(tensor->data))[i];
             }
         case GGML_TYPE_I32:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
                 return ((int32_t *)(tensor->data))[i];
             }
         case GGML_TYPE_F16:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
                 return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
             }
         case GGML_TYPE_BF16:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
                 return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]);
             }
         case GGML_TYPE_F32:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(float));
                 return ((float *)(tensor->data))[i];
             }
         default:
@@ -4150,32 +4151,26 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
     switch (tensor->type) {
        case GGML_TYPE_I8:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
                 ((int8_t *)(tensor->data))[i] = value;
             } break;
         case GGML_TYPE_I16:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
                 ((int16_t *)(tensor->data))[i] = value;
             } break;
         case GGML_TYPE_I32:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
                 ((int32_t *)(tensor->data))[i] = value;
             } break;
         case GGML_TYPE_F16:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
                 ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
             } break;
         case GGML_TYPE_BF16:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
                 ((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value);
             } break;
         case GGML_TYPE_F32:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(float));
                 ((float *)(tensor->data))[i] = value;
             } break;
         default:
@@ -7368,13 +7363,15 @@ struct ggml_tensor * ggml_add_rel_pos_inplace(
     return ggml_add_rel_pos_impl(ctx, a, pw, ph, true);
 }

-// gmml_unary
+// ggml_unary

 static struct ggml_tensor * ggml_unary_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         enum ggml_unary_op op,
         bool inplace) {
+    GGML_ASSERT(ggml_is_contiguous_1(a));
+
     bool is_node = false;

     if (!inplace && (a->grad)) {
@ -11061,6 +11058,8 @@ static void ggml_compute_forward_abs_f32(
|
||||||
const struct ggml_tensor * src0 = dst->src[0];
|
const struct ggml_tensor * src0 = dst->src[0];
|
||||||
|
|
||||||
assert(params->ith == 0);
|
assert(params->ith == 0);
|
||||||
|
assert(ggml_is_contiguous_1(src0));
|
||||||
|
assert(ggml_is_contiguous_1(dst));
|
||||||
assert(ggml_are_same_shape(src0, dst));
|
assert(ggml_are_same_shape(src0, dst));
|
||||||
|
|
||||||
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
||||||
|
@ -11070,9 +11069,6 @@ static void ggml_compute_forward_abs_f32(
|
||||||
const int n = ggml_nrows(src0);
|
const int n = ggml_nrows(src0);
|
||||||
const int nc = src0->ne[0];
|
const int nc = src0->ne[0];
|
||||||
|
|
||||||
assert(dst->nb[0] == sizeof(float));
|
|
||||||
assert(src0->nb[0] == sizeof(float));
|
|
||||||
|
|
||||||
for (int i = 0; i < n; i++) {
|
for (int i = 0; i < n; i++) {
|
||||||
ggml_vec_abs_f32(nc,
|
ggml_vec_abs_f32(nc,
|
||||||
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
||||||
@@ -11107,6 +11103,8 @@ static void ggml_compute_forward_sgn_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11116,9 +11114,6 @@ static void ggml_compute_forward_sgn_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_sgn_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11153,6 +11148,8 @@ static void ggml_compute_forward_neg_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11162,9 +11159,6 @@ static void ggml_compute_forward_neg_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_neg_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11199,6 +11193,8 @@ static void ggml_compute_forward_step_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11208,9 +11204,6 @@ static void ggml_compute_forward_step_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_step_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11245,6 +11238,8 @@ static void ggml_compute_forward_tanh_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11254,9 +11249,6 @@ static void ggml_compute_forward_tanh_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_tanh_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11291,6 +11283,8 @@ static void ggml_compute_forward_elu_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11300,9 +11294,6 @@ static void ggml_compute_forward_elu_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_elu_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11337,6 +11328,8 @@ static void ggml_compute_forward_relu_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11346,9 +11339,6 @@ static void ggml_compute_forward_relu_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_relu_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11383,6 +11373,8 @@ static void ggml_compute_forward_sigmoid_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11392,9 +11384,6 @@ static void ggml_compute_forward_sigmoid_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_sigmoid_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11428,9 +11417,9 @@ static void ggml_compute_forward_gelu_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-    GGML_ASSERT(ggml_is_contiguous_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_1(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
@@ -11491,9 +11480,9 @@ static void ggml_compute_forward_gelu_quick_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-    GGML_ASSERT(ggml_is_contiguous_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_1(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
@@ -11554,9 +11543,9 @@ static void ggml_compute_forward_silu_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-    GGML_ASSERT(ggml_is_contiguous_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_1(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
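In the gelu, gelu_quick and silu hunks above the checks switch from GGML_ASSERT to plain assert. The practical difference, hedged since it depends on build flags: assert() from <assert.h> compiles to nothing once NDEBUG is defined (typical of release builds), whereas a GGML_ASSERT-style macro is generally written to stay active and abort in every build. A small standalone illustration of that distinction (ALWAYS_ASSERT below is an invented example macro, not ggml's):

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

// Invented stand-in for an always-on assertion macro: never compiled out.
#define ALWAYS_ASSERT(x)                                        \
    do {                                                        \
        if (!(x)) {                                             \
            fprintf(stderr, "assertion failed: %s\n", #x);      \
            abort();                                            \
        }                                                       \
    } while (0)

int main(void) {
    size_t nb0 = sizeof(float);
    assert(nb0 == sizeof(float));        // disappears when built with -DNDEBUG
    ALWAYS_ASSERT(nb0 == sizeof(float)); // evaluated in every build
    puts("both checks passed");
    return 0;
}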
@@ -11617,6 +11606,8 @@ static void ggml_compute_forward_leaky_relu_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11666,11 +11657,11 @@ static void ggml_compute_forward_silu_back_f32(
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * grad = dst->src[1];
 
-    GGML_ASSERT(ggml_is_contiguous_1(grad));
-    GGML_ASSERT(ggml_is_contiguous_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_1(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, grad));
+    assert(ggml_is_contiguous_1(grad));
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+    assert(ggml_are_same_shape(src0, grad));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
@@ -11732,6 +11723,8 @@ static void ggml_compute_forward_hardswish_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11741,9 +11734,6 @@ static void ggml_compute_forward_hardswish_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_hardswish_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11775,6 +11765,8 @@ static void ggml_compute_forward_hardsigmoid_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11784,9 +11776,6 @@ static void ggml_compute_forward_hardsigmoid_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_hardsigmoid_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -16744,7 +16733,10 @@ static void ggml_compute_forward_map_unary_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
@@ -16753,9 +16745,6 @@ static void ggml_compute_forward_map_unary_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert( dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         fun(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -16793,6 +16782,9 @@ static void ggml_compute_forward_map_binary_f32(
     const struct ggml_tensor * src1 = dst->src[1];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(src1));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -16802,10 +16794,6 @@ static void ggml_compute_forward_map_binary_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert( dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-    assert(src1->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         fun(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
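The map_unary / map_binary hunks above give the user-callback ops the same treatment: the callback still receives one packed row at a time, now guarded by contiguity asserts instead of per-row nb[0] checks. A hedged sketch of how such a row callback is shaped (custom_add and apply_binary_rows are illustrative names assumed for this example, not the ggml API):

#include <stddef.h>
#include <stdio.h>

// The callback contract visible in the hunks: n packed floats in, n packed floats out.
typedef void (*binary_row_fun)(int n, float * dst, const float * a, const float * b);

static void custom_add(int n, float * dst, const float * a, const float * b) {
    for (int i = 0; i < n; ++i) {
        dst[i] = a[i] + b[i];
    }
}

// Simplified driver in the spirit of ggml_compute_forward_map_binary_f32,
// assuming rows are packed back to back (stride == cols * sizeof(float)).
static void apply_binary_rows(binary_row_fun fun, int rows, int cols,
                              float * dst, const float * a, const float * b) {
    for (int i = 0; i < rows; i++) {
        fun(cols, dst + (size_t) i*cols, a + (size_t) i*cols, b + (size_t) i*cols);
    }
}

int main(void) {
    float a[2][3]   = { {1, 2, 3}, {4, 5, 6} };
    float b[2][3]   = { {10, 20, 30}, {40, 50, 60} };
    float out[2][3];
    apply_binary_rows(custom_add, 2, 3, &out[0][0], &a[0][0], &b[0][0]);
    printf("out[1][2] = %g\n", out[1][2]);  // 66
    return 0;
}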