Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-11 09:34:37 +00:00)

Merge branch 'master' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	flake.lock
#	flake.nix

Commit 9e0dee769b: 12 changed files with 230 additions and 621 deletions
@@ -1,22 +0,0 @@
-{
-  perSystem =
-    { config, lib, ... }:
-    {
-      apps =
-        let
-          inherit (config.packages) default;
-          binaries = [
-            "llama"
-            "llama-embedding"
-            "llama-server"
-            "quantize"
-            "train-text-from-scratch"
-          ];
-          mkApp = name: {
-            type = "app";
-            program = "${default}/bin/${name}";
-          };
-        in
-        lib.genAttrs binaries mkApp;
-    };
-}
@@ -1,13 +0,0 @@
-{
-  perSystem =
-    { config, lib, ... }:
-    {
-      devShells =
-        lib.concatMapAttrs
-          (name: package: {
-            ${name} = package.passthru.shell;
-            ${name + "-extra"} = package.passthru.shell-extra;
-          })
-          config.packages;
-    };
-}
@@ -1,32 +0,0 @@
-{ inputs, ... }:
-{
-  perSystem =
-    {
-      config,
-      system,
-      lib,
-      pkgsCuda,
-      ...
-    }:
-    lib.optionalAttrs (system == "aarch64-linux") {
-      packages =
-        let
-          caps.jetson-xavier = "7.2";
-          caps.jetson-orin = "8.7";
-          caps.jetson-nano = "5.3";
-
-          pkgsFor =
-            cap:
-            import inputs.nixpkgs {
-              inherit system;
-              config = {
-                cudaSupport = true;
-                cudaCapabilities = [ cap ];
-                cudaEnableForwardCompat = false;
-                inherit (pkgsCuda.config) allowUnfreePredicate;
-              };
-            };
-        in
-        builtins.mapAttrs (name: cap: ((pkgsFor cap).callPackage ./scope.nix { }).llama-cpp) caps;
-    };
-}
@@ -1,35 +0,0 @@
-{ inputs, ... }:
-{
-  # The _module.args definitions are passed on to modules as arguments. E.g.
-  # the module `{ pkgs ... }: { /* config */ }` implicitly uses
-  # `_module.args.pkgs` (defined in this case by flake-parts).
-  perSystem =
-    { system, ... }:
-    {
-      _module.args = {
-        pkgsCuda = import inputs.nixpkgs {
-          inherit system;
-          # Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc,
-          # and ucx are built with CUDA support)
-          config.cudaSupport = true;
-          config.allowUnfreePredicate =
-            p:
-            builtins.all
-              (
-                license:
-                license.free
-                || builtins.elem license.shortName [
-                  "CUDA EULA"
-                  "cuDNN EULA"
-                ]
-              )
-              (p.meta.licenses or [ p.meta.license ]);
-        };
-        # Ensure dependencies use ROCm consistently
-        pkgsRocm = import inputs.nixpkgs {
-          inherit system;
-          config.rocmSupport = true;
-        };
-      };
-    };
-}
@@ -1,265 +0,0 @@
-{
-  lib,
-  config,
-  stdenv,
-  mkShell,
-  cmake,
-  ninja,
-  pkg-config,
-  git,
-  python3,
-  mpi,
-  openblas, # TODO: Use the generic `blas` so users could switch between alternative implementations
-  cudaPackages,
-  darwin,
-  rocmPackages,
-  clblast,
-  useBlas ? builtins.all (x: !x) [
-    useCuda
-    useMetalKit
-    useOpenCL
-    useRocm
-  ],
-  useCuda ? config.cudaSupport,
-  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
-  useMpi ? false, # Increases the runtime closure size by ~700M
-  useOpenCL ? false,
-  useRocm ? config.rocmSupport,
-  llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
-}@inputs:
-
-let
-  inherit (lib)
-    cmakeBool
-    cmakeFeature
-    optionals
-    strings
-    versionOlder
-    ;
-
-  # It's necessary to consistently use backendStdenv when building with CUDA support,
-  # otherwise we get libstdc++ errors downstream.
-  stdenv = throw "Use effectiveStdenv instead";
-  effectiveStdenv = if useCuda then cudaPackages.backendStdenv else inputs.stdenv;
-
-  suffices =
-    lib.optionals useBlas [ "BLAS" ]
-    ++ lib.optionals useCuda [ "CUDA" ]
-    ++ lib.optionals useMetalKit [ "MetalKit" ]
-    ++ lib.optionals useMpi [ "MPI" ]
-    ++ lib.optionals useOpenCL [ "OpenCL" ]
-    ++ lib.optionals useRocm [ "ROCm" ];
-
-  pnameSuffix =
-    strings.optionalString (suffices != [ ])
-      "-${strings.concatMapStringsSep "-" strings.toLower suffices}";
-  descriptionSuffix =
-    strings.optionalString (suffices != [ ])
-      ", accelerated with ${strings.concatStringsSep ", " suffices}";
-
-  # TODO: package the Python in this repository in a Nix-like way.
-  # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
-  # is PEP 517-compatible, and ensure the correct .dist-info is generated.
-  # https://peps.python.org/pep-0517/
-  llama-python = python3.withPackages (
-    ps: [
-      ps.numpy
-      ps.sentencepiece
-    ]
-  );
-
-  # TODO(Green-Sky): find a better way to opt into the heavy ML Python runtime
-  llama-python-extra = python3.withPackages (
-    ps: [
-      ps.numpy
-      ps.sentencepiece
-      ps.torchWithoutCuda
-      ps.transformers
-    ]
-  );
-
-  # apple_sdk is supposed to choose sane defaults, no need to handle isAarch64
-  # separately
-  darwinBuildInputs =
-    with darwin.apple_sdk.frameworks;
-    [
-      Accelerate
-      CoreVideo
-      CoreGraphics
-    ]
-    ++ optionals useMetalKit [ MetalKit ];
-
-  cudaBuildInputs = with cudaPackages; [
-    cuda_cccl.dev # <nv/target>
-
-    # A temporary hack for reducing the closure size, remove once cudaPackages
-    # have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792
-    cuda_cudart.dev
-    cuda_cudart.lib
-    cuda_cudart.static
-    libcublas.dev
-    libcublas.lib
-    libcublas.static
-  ];
-
-  rocmBuildInputs = with rocmPackages; [
-    clr
-    hipblas
-    rocblas
-  ];
-in
-
-effectiveStdenv.mkDerivation (
-  finalAttrs: {
-    pname = "llama-cpp${pnameSuffix}";
-    version = llamaVersion;
-
-    src = lib.cleanSourceWith {
-      filter =
-        name: type:
-        !(builtins.any (_: _) [
-          (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
-          (name == "README.md") # Ignore *.md changes when computing outPaths
-          (lib.hasPrefix "." name) # Skip hidden files and directories
-        ]);
-      src = lib.cleanSource ../../.;
-    };
-
-    postPatch = ''
-      substituteInPlace ./ggml-metal.m \
-        --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
-
-      # TODO: Package up each Python script or service appropriately.
-      # If we were to migrate to buildPythonPackage and prepare the `pyproject.toml`,
-      # we could make those *.py into setuptools' entrypoints
-      substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python"
-    '';
-
-    nativeBuildInputs =
-      [
-        cmake
-        ninja
-        pkg-config
-        git
-      ]
-      ++ optionals useCuda [
-        cudaPackages.cuda_nvcc
-
-        # TODO: Replace with autoAddDriverRunpath
-        # once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
-        cudaPackages.autoAddOpenGLRunpathHook
-      ];
-
-    buildInputs =
-      optionals effectiveStdenv.isDarwin darwinBuildInputs
-      ++ optionals useCuda cudaBuildInputs
-      ++ optionals useMpi [ mpi ]
-      ++ optionals useOpenCL [ clblast ]
-      ++ optionals useRocm rocmBuildInputs;
-
-    cmakeFlags =
-      [
-        (cmakeBool "LLAMA_NATIVE" true)
-        (cmakeBool "LLAMA_BUILD_SERVER" true)
-        (cmakeBool "BUILD_SHARED_LIBS" true)
-        (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
-        (cmakeBool "LLAMA_BLAS" useBlas)
-        (cmakeBool "LLAMA_CLBLAST" useOpenCL)
-        (cmakeBool "LLAMA_CUBLAS" useCuda)
-        (cmakeBool "LLAMA_HIPBLAS" useRocm)
-        (cmakeBool "LLAMA_METAL" useMetalKit)
-        (cmakeBool "LLAMA_MPI" useMpi)
-      ]
-      ++ optionals useCuda [
-        (
-          with cudaPackages.flags;
-          cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
-            builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
-          )
-        )
-      ]
-      ++ optionals useRocm [
-        (cmakeFeature "CMAKE_C_COMPILER" "hipcc")
-        (cmakeFeature "CMAKE_CXX_COMPILER" "hipcc")
-
-        # Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
-        # in https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
-        # and select the line that matches the current nixpkgs version of rocBLAS.
-        # Should likely use `rocmPackages.clr.gpuTargets`.
-        "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
-      ]
-      ++ optionals useMetalKit [ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") ]
-      ++ optionals useBlas [ (lib.cmakeFeature "LLAMA_BLAS_VENDOR" "OpenBLAS") ];
-
-    # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
-    # if they haven't been added yet.
-    postInstall = ''
-      mv $out/bin/main $out/bin/llama
-      mv $out/bin/server $out/bin/llama-server
-      mkdir -p $out/include
-      cp $src/llama.h $out/include/
-    '';
-
-    # Define the shells here, but don't add in the inputsFrom to avoid recursion.
-    passthru = {
-      inherit
-        useBlas
-        useCuda
-        useMetalKit
-        useMpi
-        useOpenCL
-        useRocm
-        ;
-
-      shell = mkShell {
-        name = "shell-${finalAttrs.finalPackage.name}";
-        description = "contains numpy and sentencepiece";
-        buildInputs = [ llama-python ];
-        inputsFrom = [ finalAttrs.finalPackage ];
-      };
-
-      shell-extra = mkShell {
-        name = "shell-extra-${finalAttrs.finalPackage.name}";
-        description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers";
-        buildInputs = [ llama-python-extra ];
-        inputsFrom = [ finalAttrs.finalPackage ];
-      };
-    };
-
-    meta = {
-      # Configurations we don't want even the CI to evaluate. Results in the
-      # "unsupported platform" messages. This is mostly a no-op, because
-      # cudaPackages would've refused to evaluate anyway.
-      badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
-
-      # Configurations that are known to result in build failures. Can be
-      # overridden by importing Nixpkgs with `allowBroken = true`.
-      broken = (useMetalKit && !effectiveStdenv.isDarwin);
-
-      description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
-      homepage = "https://github.com/ggerganov/llama.cpp/";
-      license = lib.licenses.mit;
-
-      # Accommodates `nix run` and `lib.getExe`
-      mainProgram = "llama";
-
-      # These people might respond, on a best-effort basis, if you ping them
-      # in case of Nix-specific regressions or for reviewing Nix-specific PRs.
-      # Consider adding yourself to this list if you want to ensure this flake
-      # stays maintained and you're willing to invest your time. Do not add
-      # other people without their consent. Consider removing people after
-      # they've been unreachable for long periods of time.
-
-      # Note that lib.maintainers is defined in Nixpkgs, but you may just add
-      # an attrset following the same format as in
-      # https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
-      maintainers = with lib.maintainers; [
-        philiptaron
-        SomeoneSerge
-      ];
-
-      # Extend `badPlatforms` instead
-      platforms = lib.platforms.all;
-    };
-  }
-)
@@ -1,12 +0,0 @@
-{
-  lib,
-  newScope,
-  llamaVersion ? "0.0.0",
-}:
-
-lib.makeScope newScope (
-  self: {
-    inherit llamaVersion;
-    llama-cpp = self.callPackage ./package.nix { };
-  }
-)
.github/workflows/nix-flakestry.yml (vendored, 23 deletions)
@@ -1,23 +0,0 @@
-# Make the flake discoverable on https://flakestry.dev
-name: "Publish a flake to flakestry"
-on:
-  push:
-    tags:
-      - "v?[0-9]+.[0-9]+.[0-9]+"
-      - "v?[0-9]+.[0-9]+"
-  workflow_dispatch:
-    inputs:
-      tag:
-        description: "The existing tag to publish"
-        type: "string"
-        required: true
-jobs:
-  publish-flake:
-    runs-on: ubuntu-latest
-    permissions:
-      id-token: "write"
-      contents: "read"
-    steps:
-      - uses: flakestry/flakestry-publish@main
-        with:
-          version: "${{ inputs.tag || github.ref_name }}"
@@ -146,6 +146,27 @@ static std::string get_ftype(int ftype) {
     }
 }
 
+//
+// image data
+//
+
+// RGB uint8 image
+struct clip_image_u8 {
+    int nx;
+    int ny;
+
+    std::vector<uint8_t> buf;
+};
+
+// RGB float32 image (NHWC)
+// Memory layout: RGBRGBRGB...
+struct clip_image_f32 {
+    int nx;
+    int ny;
+
+    std::vector<float> buf;
+};
+
 //
 // clip layers
 //
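Both structs now own their pixels through `std::vector` instead of the old raw `data` pointer plus separate `size` field, so the size always matches the allocation and no manual `delete[]` is needed. A minimal sketch of the difference (the `make_square` helper is hypothetical, not from this commit):

```cpp
#include <cstdint>
#include <vector>

// Vector-backed image: the buffer owns its memory and carries its size.
struct clip_image_u8 {
    int nx;
    int ny;
    std::vector<uint8_t> buf; // was: uint8_t * data; size_t size;
};

// Hypothetical helper: resize implies (re)allocation, and destruction frees
// the pixels automatically, which is why the _free functions later in this
// diff shrink to a plain delete.
static void make_square(clip_image_u8 & img, int side) {
    img.nx = side;
    img.ny = side;
    img.buf.resize(3 * side * side); // 3 channels, zero-filled on growth
}
```

The same reasoning applies to `clip_image_f32`, whose `float * data` becomes `std::vector<float> buf`.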
@@ -207,13 +228,18 @@ struct clip_ctx {
     bool has_text_encoder = false;
     bool has_vision_encoder = false;
     bool has_llava_projector = false;
 
     struct clip_vision_model vision_model;
 
     float image_mean[3];
     float image_std[3];
     bool use_gelu = false;
     int32_t ftype = 1;
-    struct ggml_context * ctx;
     struct gguf_context * ctx_gguf;
+    struct ggml_context * ctx_data;
+
+    std::vector<uint8_t> buf_compute_meta;
 
     // memory buffers to evaluate the model
     ggml_backend_buffer_t params_buffer = NULL;
@@ -222,7 +248,7 @@ struct clip_ctx {
     ggml_allocr * compute_alloc = NULL;
 };
 
-static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_image_f32_batch * imgs) {
+static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
     if (!ctx->has_vision_encoder) {
         printf("This gguf file seems to have no vision encoder\n");
         return nullptr;
@@ -246,9 +272,10 @@ static ggml_cgraph * clip_image_build_graph
     if (ctx->has_llava_projector) {
         GGML_ASSERT(batch_size == 1);
     }
 
     struct ggml_init_params params = {
-        /*.mem_size   =*/ GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead(),
-        /*.mem_buffer =*/ NULL,
+        /*.mem_size   =*/ ctx->buf_compute_meta.size(),
+        /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
         /*.no_alloc   =*/ true,
     };
 
@@ -272,7 +299,7 @@ static ggml_cgraph * clip_image_build_graph
         for (int k = 0; k < 3; k++) {
             for (int y = 0; y < ny; y++) {
                 for (int x = 0; x < nx; x++) {
-                    data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].data[3 * (y * nx + x) + k];
+                    data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].buf[3 * (y * nx + x) + k];
                }
            }
        }
@@ -413,7 +440,7 @@ static ggml_cgraph * clip_image_build_graph
     ggml_allocr_alloc(ctx->compute_alloc, patches);
     if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
         int* patches_data = (int*)malloc(ggml_nbytes(patches));
-        for (int i = 0; i < num_positions; i++) {
+        for (int i = 0; i < num_patches; i++) {
             patches_data[i] = i + 1;
         }
         ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
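Two things change in this stretch: the patch-index loop is fixed to iterate over `num_patches` instead of `num_positions`, and graph building now borrows the preallocated `buf_compute_meta` scratch buffer rather than sizing a fresh context per call (with `no_alloc = true`, ggml writes only tensor bookkeeping there, never tensor data). A self-contained analogue of the arena-reuse pattern, using no ggml API:

```cpp
#include <cstddef>
#include <vector>

// Analogue of buf_compute_meta: one scratch arena, sized once up front and
// reused for every graph build instead of allocating per call.
struct Arena {
    std::vector<unsigned char> buf;
    size_t used = 0;

    void * alloc(size_t n) {
        if (used + n > buf.size()) return nullptr; // arena was sized by the caller
        void * p = buf.data() + used;
        used += n;
        return p;
    }
    void reset() { used = 0; } // the next build reuses the same storage
};

int main() {
    Arena meta;
    meta.buf.resize(1 << 20);      // like GGML_DEFAULT_GRAPH_SIZE * overhead in the diff
    void * node = meta.alloc(128); // metadata for one "tensor"
    meta.reset();
    return node == nullptr;        // 0 on success
}
```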
@@ -561,8 +588,8 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         /*.no_alloc =*/ true,
     };
 
-    new_clip->ctx = ggml_init(params);
-    if (!new_clip->ctx) {
+    new_clip->ctx_data = ggml_init(params);
+    if (!new_clip->ctx_data) {
         fprintf(stderr, "%s: ggml_init() failed\n", __func__);
         clip_free(new_clip);
         return nullptr;
@@ -579,7 +606,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     for (int i = 0; i < n_tensors; ++i) {
         const char * name = gguf_get_tensor_name(ctx, i);
         struct ggml_tensor * t = ggml_get_tensor(meta, name);
-        struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx, t);
+        struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx_data, t);
         ggml_set_name(cur, name);
     }
 
@@ -588,7 +615,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     ggml_allocr* alloc = ggml_allocr_new_from_buffer(new_clip->params_buffer);
     for (int i = 0; i < n_tensors; ++i) {
         const char * name = gguf_get_tensor_name(ctx, i);
-        struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx, name);
+        struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
         ggml_allocr_alloc(alloc, cur);
         const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
         fin.seekg(offset, std::ios::beg);
@@ -644,35 +671,35 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         printf("v_n_layer %d\n", hparams.n_layer);
     }
 
-    vision_model.patch_embeddings = get_tensor(new_clip->ctx, TN_PATCH_EMBD);
-    vision_model.class_embedding = get_tensor(new_clip->ctx, TN_CLASS_EMBD);
-    vision_model.position_embeddings = get_tensor(new_clip->ctx, format(TN_POS_EMBD, "v"));
-    vision_model.pre_ln_w = get_tensor(new_clip->ctx, format(TN_LN_PRE, "v", "weight"));
-    vision_model.pre_ln_b = get_tensor(new_clip->ctx, format(TN_LN_PRE, "v", "bias"));
-    vision_model.mm_0_w = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 0, "weight"));
-    vision_model.mm_0_b = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 0, "bias"));
-    vision_model.mm_2_w = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 2, "weight"));
-    vision_model.mm_2_b = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 2, "bias"));
+    vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
+    vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
+    vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
+    vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
+    vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
+    vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
+    vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
+    vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
+    vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
 
     vision_model.layers.resize(hparams.n_layer);
     for (int il = 0; il < hparams.n_layer; ++il) {
         auto & layer = vision_model.layers[il];
-        layer.k_w = get_tensor(new_clip->ctx, format(TN_ATTN_K, "v", il, "weight"));
-        layer.q_w = get_tensor(new_clip->ctx, format(TN_ATTN_Q, "v", il, "weight"));
-        layer.v_w = get_tensor(new_clip->ctx, format(TN_ATTN_V, "v", il, "weight"));
-        layer.o_w = get_tensor(new_clip->ctx, format(TN_ATTN_OUTPUT, "v", il, "weight"));
-        layer.ln_1_w = get_tensor(new_clip->ctx, format(TN_LN_1, "v", il, "weight"));
-        layer.ln_2_w = get_tensor(new_clip->ctx, format(TN_LN_2, "v", il, "weight"));
-        layer.ff_i_w = get_tensor(new_clip->ctx, format(TN_FFN_DOWN, "v", il, "weight"));
-        layer.ff_o_w = get_tensor(new_clip->ctx, format(TN_FFN_UP, "v", il, "weight"));
-        layer.k_b = get_tensor(new_clip->ctx, format(TN_ATTN_K, "v", il, "bias"));
-        layer.q_b = get_tensor(new_clip->ctx, format(TN_ATTN_Q, "v", il, "bias"));
-        layer.v_b = get_tensor(new_clip->ctx, format(TN_ATTN_V, "v", il, "bias"));
-        layer.o_b = get_tensor(new_clip->ctx, format(TN_ATTN_OUTPUT, "v", il, "bias"));
-        layer.ln_1_b = get_tensor(new_clip->ctx, format(TN_LN_1, "v", il, "bias"));
-        layer.ln_2_b = get_tensor(new_clip->ctx, format(TN_LN_2, "v", il, "bias"));
-        layer.ff_i_b = get_tensor(new_clip->ctx, format(TN_FFN_DOWN, "v", il, "bias"));
-        layer.ff_o_b = get_tensor(new_clip->ctx, format(TN_FFN_UP, "v", il, "bias"));
+        layer.k_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "weight"));
+        layer.q_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "weight"));
+        layer.v_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "weight"));
+        layer.o_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "weight"));
+        layer.ln_1_w = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "weight"));
+        layer.ln_2_w = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "weight"));
+        layer.ff_i_w = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "weight"));
+        layer.ff_o_w = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "weight"));
+        layer.k_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "bias"));
+        layer.q_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "bias"));
+        layer.v_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "bias"));
+        layer.o_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "bias"));
+        layer.ln_1_b = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "bias"));
+        layer.ln_2_b = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "bias"));
+        layer.ff_i_b = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "bias"));
+        layer.ff_o_b = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "bias"));
     }
 }
 
@@ -682,6 +709,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
 
     // measure mem requirement and allocate
     {
+        new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
         new_clip->compute_alloc = ggml_allocr_new_measure_from_backend(new_clip->backend);
         clip_image_f32_batch batch;
         batch.size = 1;
@@ -697,26 +725,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     return new_clip;
 }
 
-clip_image_u8 * make_clip_image_u8() {
-    auto img = new clip_image_u8();
-    return img;
+struct clip_image_u8 * clip_image_u8_init() {
+    return new clip_image_u8();
 }
-clip_image_f32 * make_clip_image_f32() { return new clip_image_f32(); }
 
-void clip_image_u8_free(clip_image_u8 * img) { if (img->data) { delete[] img->data; } delete img; }
-void clip_image_f32_free(clip_image_f32 * img) { if (img->data) { delete[] img->data; } delete img; }
+struct clip_image_f32 * clip_image_f32_init() {
+    return new clip_image_f32();
+}
+
+void clip_image_u8_free (struct clip_image_u8 * img) { delete img; }
+void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
 
 static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) {
     img->nx = nx;
     img->ny = ny;
-    img->size = nx * ny * 3;
-    img->data = new uint8_t[img->size]();
-    memcpy(img->data, data, img->size);
+    img->buf.resize(3 * nx * ny);
+    memcpy(img->buf.data(), data, img->buf.size());
 }
 
 bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
     int nx, ny, nc;
-    auto data = stbi_load(fname, &nx, &ny, &nc, 3);
+    auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
     if (!data) {
         fprintf(stderr, "%s: failed to load image '%s'\n", __func__, fname);
         return false;
@@ -728,7 +757,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
 
 bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
     int nx, ny, nc;
-    auto data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
+    auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
     if (!data) {
         fprintf(stderr, "%s: failed to decode image bytes\n", __func__);
         return false;
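The allocation API is renamed from `make_clip_image_*` to `clip_image_*_init` with matching `_free` functions, and since the structs now own their buffers the free functions reduce to a plain `delete`. A hedged sketch of the new lifecycle (error handling trimmed; assumes the clip.h from this commit):

```cpp
#include "clip.h"

bool load_and_release(const char * path) {
    struct clip_image_u8 * img = clip_image_u8_init();     // was make_clip_image_u8()
    const bool ok = clip_image_load_from_file(path, img);
    clip_image_u8_free(img); // now just `delete img`; the vector frees itself
    return ok;
}
```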
@@ -740,7 +769,7 @@ bool clip_image_load_from_bytes
 
 // normalize: x = (x - mean) / std
 // TODO: implement bicubic interpolation instead of linear.
-bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res, const bool pad2square) {
+bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res, const bool pad2square) {
     if (!ctx->has_vision_encoder) {
         printf("This gguf file seems to have no vision encoder\n");
         return false;
@@ -749,18 +778,17 @@ bool clip_image_preprocess
     // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
     // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
 
-    clip_image_u8 * temp = make_clip_image_u8(); // we will keep the input image data here temporarily
+    clip_image_u8 * temp = clip_image_u8_init(); // we will keep the input image data here temporarily
     if (pad2square && img->nx != img->ny) {
         int longer_side = std::max(img->nx, img->ny);
         temp->nx = longer_side;
         temp->ny = longer_side;
-        temp->size = 3 * longer_side * longer_side;
-        temp->data = new uint8_t[temp->size]();
-        uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA
+        temp->buf.resize(3 * longer_side * longer_side);
+        const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA
 
         // fill with background color
-        for (size_t i = 0; i < temp->size; i++) {
-            temp->data[i] = bc[i % 3];
+        for (size_t i = 0; i < temp->buf.size(); i++) {
+            temp->buf[i] = bc[i % 3];
         }
 
         // copy from the input image
@@ -768,17 +796,16 @@ bool clip_image_preprocess
             for (int x = 0; x < img->nx; x++) {
                 const int i = 3 * (y * img->nx + x);
                 const int j = 3 * (y * temp->nx + x);
-                temp->data[j]   = img->data[i];
-                temp->data[j+1] = img->data[i+1];
-                temp->data[j+2] = img->data[i+2];
+                temp->buf[j]   = img->buf[i];
+                temp->buf[j+1] = img->buf[i+1];
+                temp->buf[j+2] = img->buf[i+2];
             }
         }
     } else {
         temp->nx = img->nx;
         temp->ny = img->ny;
-        temp->size = img->size;
-        temp->data = new uint8_t[temp->size]();
-        memcpy(&temp->data[0], &img->data[0], temp->size); // copy
+        temp->buf.resize(img->buf.size());
+        memcpy(temp->buf.data(), img->buf.data(), temp->buf.size());
     }
 
     const int nx = temp->nx;
@@ -789,8 +816,7 @@ bool clip_image_preprocess
 
     res->nx = nx2;
     res->ny = ny2;
-    res->size = 3 * nx2 * ny2;
-    res->data = new float[res->size]();
+    res->buf.resize(3 * nx2 * ny2);
 
     const float scale = std::max(nx, ny) / (float)ctx->vision_model.hparams.image_size;
 
@@ -821,10 +847,10 @@ bool clip_image_preprocess
                 const int j10 = 3 * (y1 * nx + x0) + c;
                 const int j11 = 3 * (y1 * nx + x1) + c;
 
-                const float v00 = temp->data[j00];
-                const float v01 = temp->data[j01];
-                const float v10 = temp->data[j10];
-                const float v11 = temp->data[j11];
+                const float v00 = temp->buf[j00];
+                const float v01 = temp->buf[j01];
+                const float v10 = temp->buf[j10];
+                const float v11 = temp->buf[j11];
 
                 const float v0 = v00 * (1.0f - dx) + v01 * dx;
                 const float v1 = v10 * (1.0f - dx) + v11 * dx;
@@ -835,7 +861,7 @@ bool clip_image_preprocess
 
                 const int i = 3 * (y * nx3 + x) + c;
 
-                res->data[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c];
+                res->buf[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c];
             }
         }
     }
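These two hunks carry the bilinear resample and the per-channel normalization x = (x - mean) / std, now reading and writing the vector-backed buffers. The same arithmetic on a single sample, self-contained (all numeric values here are illustrative only):

```cpp
#include <cstdio>

int main() {
    // Four neighboring samples and fractional offsets, as in the diff.
    const float v00 = 10.f, v01 = 20.f, v10 = 30.f, v11 = 40.f;
    const float dx = 0.25f, dy = 0.5f;

    // Interpolate along x on both rows, then along y.
    const float v0 = v00 * (1.0f - dx) + v01 * dx; // 12.5
    const float v1 = v10 * (1.0f - dx) + v11 * dx; // 32.5
    const float v2 = v0 * (1.0f - dy) + v1 * dy;   // 22.5

    // Normalize to the model's expected distribution: x = (x - mean) / std.
    const float mean = 0.48145466f, std_ = 0.26862954f; // illustrative per-channel values
    const float out = ((v2 / 255.0f) - mean) / std_;
    printf("%f\n", out);
}
```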
@@ -845,12 +871,13 @@ bool clip_image_preprocess
 }
 
 void clip_free(clip_ctx * ctx) {
-    ggml_free(ctx->ctx);
+    ggml_free(ctx->ctx_data);
     gguf_free(ctx->ctx_gguf);
 
     delete ctx;
 }
 
-bool clip_image_encode(const clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
+bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
     if (!ctx->has_vision_encoder) {
         printf("This gguf file seems to have no vision encoder\n");
         return false;
@@ -862,8 +889,7 @@ bool clip_image_encode
     return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
 }
 
-bool clip_image_batch_encode(const clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
-
+bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
     if (!ctx->has_vision_encoder) {
         printf("This gguf file seems to have no vision encoder\n");
         return false;
@@ -926,11 +952,12 @@ bool clip_model_quantize
         return false;
     };
 
-    auto ctx_clip = clip_model_load(fname_inp, 2);
-    const auto & ctx_src = ctx_clip->ctx_gguf;
-    const auto & ctx_data = ctx_clip->ctx;
+    auto * ctx_clip = clip_model_load(fname_inp, 2);
 
-    auto ctx_out = gguf_init_empty();
+    const auto & ctx_src = ctx_clip->ctx_gguf;
+    const auto & ctx_data = ctx_clip->ctx_data;
+
+    auto * ctx_out = gguf_init_empty();
     gguf_set_kv(ctx_out, ctx_src);
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
     gguf_set_val_u32(ctx_out, "general.file_type", itype);
@@ -35,31 +35,14 @@ struct clip_vision_hparams {
     float eps;
 };
 
-CLIP_API struct clip_ctx * clip_model_load(const char * fname, const int verbosity);
+/** load mmproj model */
+CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);
+/** free mmproj model */
 CLIP_API void clip_free(struct clip_ctx * ctx);
 
-size_t clip_embd_nbytes(const struct clip_ctx * ctx);
-int clip_n_patches(const struct clip_ctx * ctx);
-int clip_n_mmproj_embd(const struct clip_ctx * ctx);
-
-// RGB uint8 image
-struct clip_image_u8 {
-    int nx;
-    int ny;
-    uint8_t * data = NULL;
-    size_t size;
-};
-
-// RGB float32 image (NHWC)
-// Memory layout: RGBRGBRGB...
-struct clip_image_f32 {
-    int nx;
-    int ny;
-    float * data = NULL;
-    size_t size;
-};
+CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
+
+CLIP_API int clip_n_patches    (const struct clip_ctx * ctx);
+CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
 
 struct clip_image_u8_batch {
     struct clip_image_u8 * data;
@@ -71,21 +54,22 @@ struct clip_image_f32_batch {
     size_t size;
 };
 
-struct clip_image_u8 * make_clip_image_u8();
-struct clip_image_f32 * make_clip_image_f32();
-CLIP_API void clip_image_u8_free(clip_image_u8 * img);
-CLIP_API void clip_image_f32_free(clip_image_f32 * img);
+CLIP_API struct clip_image_u8  * clip_image_u8_init ();
+CLIP_API struct clip_image_f32 * clip_image_f32_init();
+
+CLIP_API void clip_image_u8_free (struct clip_image_u8  * img);
+CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
 
 CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
 
 /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
 CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
 
-bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square);
-bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec);
-
-bool clip_image_batch_encode(const struct clip_ctx * ctx, const int n_threads, const struct clip_image_f32_batch * imgs,
-                             float * vec);
-
-bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype);
+CLIP_API bool clip_image_preprocess  (struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, bool pad2square);
+CLIP_API bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
+CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
+
+CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
 
 #ifdef __cplusplus
 }
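With every entry point now exported via CLIP_API and the mutating calls taking a non-const `clip_ctx *`, a full embedding round-trip against this header looks roughly as follows (a sketch only; the thread count and the error-free flow are assumptions, and the caller owns the returned buffer):

```cpp
#include "clip.h"
#include <cstdlib>

float * embed_image(const char * model_path, const char * image_path) {
    struct clip_ctx * ctx = clip_model_load(model_path, /*verbosity=*/1);

    struct clip_image_u8  * img = clip_image_u8_init();
    struct clip_image_f32 * res = clip_image_f32_init();
    clip_image_load_from_file(image_path, img);
    clip_image_preprocess(ctx, img, res, /*pad2square=*/true);

    // clip_embd_nbytes gives the output size for one image embedding.
    float * vec = (float *)malloc(clip_embd_nbytes(ctx));
    clip_image_encode(ctx, /*n_threads=*/4, res, vec);

    clip_image_f32_free(res);
    clip_image_u8_free(img);
    clip_free(ctx);
    return vec; // caller frees
}
```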
@@ -10,7 +10,7 @@
 #include "base64.hpp"
 
 static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
-    clip_image_f32 * img_res = make_clip_image_f32();
+    clip_image_f32 * img_res = clip_image_f32_init();
     if (!clip_image_preprocess(ctx_clip, img, img_res, /*pad2square =*/ true)) {
         fprintf(stderr, "%s: unable to preprocess image\n", __func__);
         clip_image_f32_free(img_res);
@@ -86,7 +86,7 @@ bool llava_eval_image_embed
 }
 
 LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
-    clip_image_u8 * img = make_clip_image_u8();
+    clip_image_u8 * img = clip_image_u8_init();
     if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
         clip_image_u8_free(img);
         fprintf(stderr, "%s: can't load image from bytes, is it a valid image?", __func__);
@@ -83,7 +83,7 @@ static inline bool is_base64(uint8_t c)
     return (isalnum(c) || (c == '+') || (c == '/'));
 }
 
-static std::vector<uint8_t> base64_decode(std::string const &encoded_string)
+static std::vector<uint8_t> base64_decode(const std::string & encoded_string)
 {
     int i = 0;
     int j = 0;
@@ -213,7 +213,7 @@ struct slot_image
     float * image_embedding = nullptr;
     int32_t image_tokens = 0;
 
-    clip_image_u8 img_data;
+    clip_image_u8 * img_data;
 
     std::string prefix_prompt; // before of this image
 };
@@ -438,7 +438,9 @@ struct llama_client_slot
         for (slot_image & img : images)
         {
             free(img.image_embedding);
-            delete[] img.img_data.data;
+            if (img.img_data) {
+                clip_image_u8_free(img.img_data);
+            }
             img.prefix_prompt = "";
         }
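`slot_image` now holds an owning `clip_image_u8 *` that is released through the library's own free function, guarded by a null check because a slot may never have received an image. The ownership rule in isolation (the condensed `Slot` type is hypothetical, not from server.cpp):

```cpp
#include "clip.h"

// Whoever init()s the image free()s it through the same API, never delete[].
struct Slot {
    clip_image_u8 * img_data = nullptr;

    ~Slot() {
        if (img_data) {              // a slot may hold no image
            clip_image_u8_free(img_data);
            img_data = nullptr;
        }
    }
};
```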
@@ -852,24 +854,17 @@ struct llama_server_context
             {
                 for (const auto &img : *images_data)
                 {
-                    std::string data_b64 = img["data"].get<std::string>();
+                    const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
+
                     slot_image img_sl;
                     img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
-                    int width, height, channels;
-                    std::vector<uint8_t> image_buffer = base64_decode(data_b64);
-                    data_b64.clear();
-                    auto data = stbi_load_from_memory(image_buffer.data(), image_buffer.size(), &width, &height, &channels, 3);
-                    if (!data) {
+                    img_sl.img_data = clip_image_u8_init();
+                    if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
+                    {
                         LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id);
                         return false;
                     }
-                    LOG_TEE("slot %i - image loaded [id: %i] resolution (%i x %i)\n", slot->id, img_sl.id, width, height);
-                    img_sl.img_data.nx = width;
-                    img_sl.img_data.ny = height;
-                    img_sl.img_data.size = width * height * 3;
-                    img_sl.img_data.data = new uint8_t[width * height * 3]();
-                    memcpy(img_sl.img_data.data, data, width * height * 3);
-                    stbi_image_free(data);
+                    LOG_TEE("slot %i - loaded image\n", slot->id);
                     img_sl.request_encode_image = true;
                     slot->images.push_back(img_sl);
                 }
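Image ingestion now hands the raw base64-decoded bytes to `clip_image_load_from_bytes` instead of calling stb_image directly, so decoding details stay inside clip.cpp. A condensed sketch of the new path (assumes `slot_image` and `base64_decode` from server.cpp; the free-on-failure lines are an addition of this sketch, not of the diff):

```cpp
static bool load_slot_image(slot_image & img_sl, const std::string & data_b64) {
    const std::vector<uint8_t> image_buffer = base64_decode(data_b64);

    img_sl.img_data = clip_image_u8_init(); // owning pointer now
    if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data)) {
        clip_image_u8_free(img_sl.img_data); // sketch addition: release on failure
        img_sl.img_data = nullptr;
        return false;                        // stbi details stay inside clip.cpp
    }
    img_sl.request_encode_image = true;
    return true;
}
```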
@@ -1144,8 +1139,8 @@ struct llama_server_context
                 {
                     continue;
                 }
-                clip_image_f32 img_res;
-                if (!clip_image_preprocess(clp_ctx, &img.img_data, &img_res, /*pad2square =*/ true))
+                clip_image_f32 * img_res = clip_image_f32_init();
+                if (!clip_image_preprocess(clp_ctx, img.img_data, img_res, /*pad2square =*/ true))
                 {
                     LOG_TEE("Error processing the given image");
                     clip_free(clp_ctx);
@@ -1160,11 +1155,12 @@ struct llama_server_context
                     return false;
                 }
                 LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id);
-                if (!clip_image_encode(clp_ctx, params.n_threads, &img_res, img.image_embedding))
+                if (!clip_image_encode(clp_ctx, params.n_threads, img_res, img.image_embedding))
                 {
                     LOG_TEE("Unable to encode image\n");
                     return false;
                 }
+                clip_image_f32_free(img_res);
                 img.request_encode_image = false;
             }
 
ggml-quants.c (118 changes)
@@ -412,13 +412,17 @@ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
 
 #if !defined(__ARM_FEATURE_DOTPROD)
 
-inline static int32x4_t vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
+inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
     const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
     const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
 
     return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
 }
 
+#else
+
+#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
+
 #endif
 
 #endif
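The rename matters because defining a function literally named `vdotq_s32` collides with the ACLE intrinsic of the same name on targets that do support dotprod; with the `ggml_` prefix, the widening-multiply fallback and the macro that forwards to the hardware intrinsic can coexist. For reference, what the callers ultimately rely on is the total signed 8-bit dot product (a scalar model, not from the commit):

```cpp
#include <cstdint>

// Reference for what the kernels actually rely on: the *total* int8 dot
// product. The hardware vdotq_s32 packs 4-element dot products per 32-bit
// lane; the NEON fallback spreads products across lanes differently, but
// both preserve the overall sum, which the callers reduce with vaddvq.
static int32_t dot_s8_ref(const int8_t a[16], const int8_t b[16]) {
    int32_t sum = 0;
    for (int i = 0; i < 16; ++i) {
        sum += (int32_t)a[i] * (int32_t)b[i];
    }
    return sum;
}
```

Because only the sum is consumed, the fallback does not need to be lane-for-lane identical to the intrinsic, just sum-equivalent.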
@@ -2483,8 +2487,8 @@ void ggml_vec_dot_q4_0_q8_0
         const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
 
         // dot product into int32x4_t
-        const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
-        const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
+        const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
+        const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
 
         sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
         sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
@@ -2771,8 +2775,8 @@ void ggml_vec_dot_q4_1_q8_1
         const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
 
         // dot product into int32x4_t
-        const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
-        const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
+        const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
+        const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
 
         sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d);
         sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d);
@@ -2938,11 +2942,11 @@ void ggml_vec_dot_q5_0_q8_0
         const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
 
         sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
-                        vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
         sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
-                        vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
     }
 
     *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
@@ -3230,11 +3234,11 @@ void ggml_vec_dot_q5_1_q8_1
         const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
 
         sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
-                        vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
         sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
-                        vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
     }
 
     *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
@@ -3485,12 +3489,12 @@ void ggml_vec_dot_q8_0_q8_0
         const int8x16_t y1_1 = vld1q_s8(y1->qs + 16);
 
         sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
-                        vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+                        ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
+                        ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
 
         sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
-                        vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+                        ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
+                        ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
     }
 
     *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
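Every kernel in this stretch has the same shape: an integer dot product per quantization block, scaled by the blocks' fp16 scales and accumulated in float, i.e. s = sum_b d_x[b] * d_y[b] * sum_i x[b][i] * y[b][i]. A scalar model of the q8_0 x q8_0 case (simplified block layout; ggml stores `d` as fp16, plain float here):

```cpp
#include <cstdint>

// Scalar model of ggml_vec_dot_q8_0_q8_0: per 32-element block, an int8
// dot product scaled by both blocks' scales, accumulated in float.
struct block_q8_0_ref {
    float  d;       // block scale (fp16 in ggml; float for simplicity)
    int8_t qs[32];  // quantized values
};

static float vec_dot_q8_0_ref(int nblocks, const block_q8_0_ref * x, const block_q8_0_ref * y) {
    float sumf = 0.0f;
    for (int b = 0; b < nblocks; ++b) {
        int32_t sumi = 0;
        for (int i = 0; i < 32; ++i) {
            sumi += (int32_t)x[b].qs[i] * (int32_t)y[b].qs[i];
        }
        sumf += x[b].d * y[b].d * (float)sumi;
    }
    return sumf;
}
```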
@ -3600,8 +3604,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
// We use this macro instead of a function call because for some reason
|
// We use this macro instead of a function call because for some reason
|
||||||
// the code runs 2-3% slower, even if the function is declared inline
|
// the code runs 2-3% slower, even if the function is declared inline
|
||||||
#define MULTIPLY_ACCUM_WITH_SCALE(index)\
|
#define MULTIPLY_ACCUM_WITH_SCALE(index)\
|
||||||
isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\
|
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\
|
||||||
isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)];
|
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)];
|
||||||
|
|
||||||
#define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\
|
#define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\
|
||||||
q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\
|
q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\
|
||||||
|
@@ -3975,10 +3979,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
         q2bytes.val[2] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 4), m3));
         q2bytes.val[3] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 6), m3));

-        isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0];
-        isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1];
-        isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2];
-        isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3];
+        isum1 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0];
+        isum2 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1];
+        isum1 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2];
+        isum2 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3];

         sum += d * (isum1 + isum2);
     }
@@ -4258,10 +4262,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
         q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3]));

-        isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0];
-        isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1];
-        isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2];
-        isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3];
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0];
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1];
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2];
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3];

         scale += 4;

@@ -4275,10 +4279,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
         q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3]));

-        isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0];
-        isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1];
-        isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2];
-        isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3];
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0];
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1];
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2];
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3];

         scale += 4;

@@ -4759,10 +4763,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         q3bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 4), m3b), q3h.val[2]));
         q3bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q3bits, 6), q3h.val[3]));

-        isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0];
-        isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2];
-        isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1];
-        isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3];
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0];
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2];
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1];
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3];

         sum += d * isum;

@@ -5111,14 +5115,14 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
         q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
         q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));

-        const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
+        const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
         sumi1 += vaddvq_s32(p1) * scales[2*j+0];

         q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
         q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
         q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));

-        const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
+        const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);

         sumi2 += vaddvq_s32(p2) * scales[2*j+1];
     }
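Worth noting in the q4_K and q5_K hunks: the accumulator is threaded through two chained dot calls, so p1 covers 32 byte pairs in two instructions. A scalar reference for what vaddvq_s32(p1) evaluates to (names hypothetical, for illustration only):

    #include <stdint.h>

    // Scalar equivalent of
    //   vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, a0, b0), a1, b1))
    // with mzero == 0: the full 32-element int8 dot product.
    static int32_t dot32_ref(const int8_t a[32], const int8_t b[32]) {
        int32_t sum = 0;
        for (int i = 0; i < 32; ++i) {
            sum += (int32_t) a[i] * (int32_t) b[i];  // widen before multiply-accumulate
        }
        return sum;
    }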
@@ -5451,13 +5455,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
         q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
         q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));

-        const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
+        const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
         const int32_t sumi1 = vaddvq_s32(p1) * scales[0];

         q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
         q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));

-        const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]);
+        const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]);
         const int32_t sumi2 = vaddvq_s32(p2) * scales[1];

         sumf += d * (sumi1 + sumi2);
@@ -5724,8 +5728,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
         q5bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[0], 4), q5h.val[2]));
         q5bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[1], 4), q5h.val[3]));

-        sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++;
-        sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++;
+        sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++;
+        sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++;
     }

     sumf += d * sumi - dmin * sumi_mins;
@@ -6114,10 +6118,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
         q5bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[0], 4)), vreinterpretq_s8_u8(q5h.val[2]));
         q5bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[1], 4)), vreinterpretq_s8_u8(q5h.val[3]));

-        int32_t sumi1 = sc[0] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]));
-        int32_t sumi2 = sc[1] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1]));
-        int32_t sumi3 = sc[2] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]));
-        int32_t sumi4 = sc[3] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3]));
+        int32_t sumi1 = sc[0] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]));
+        int32_t sumi2 = sc[1] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1]));
+        int32_t sumi3 = sc[2] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]));
+        int32_t sumi4 = sc[3] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3]));

         sumf += d * (sumi1 + sumi2 + sumi3 + sumi4);
     }
@@ -6401,10 +6405,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
         q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2]));
         q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3]));

-        isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
-                vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
-                vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
-                vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
+                vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
+                vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
+                vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];

         scale += 4;

@@ -6428,10 +6432,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
         q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2]));
         q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3]));

-        isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
-                vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
-                vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
-                vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
+                vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
+                vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
+                vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
         scale += 4;
     }
         //sum += isum * d_all * y[i].d;
@@ -6818,10 +6822,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
         q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[2])), m32s);
         q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[3])), m32s);

-        isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
-                vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
-                vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
-                vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
+                vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
+                vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
+                vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];

         sum += isum * d_all * y[i].d;

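With that, every NEON dot-product call site in these quantized kernels goes through the wrapper, and the file no longer references vdotq_s32 directly. A small, hypothetical self-test for aarch64 (assuming the ggml_vdotq_s32 sketch from earlier is in scope) that checks the wrapper against a scalar loop:

    #include <arm_neon.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        int8_t a[16], b[16];
        for (int i = 0; i < 16; ++i) { a[i] = (int8_t)(i - 8); b[i] = (int8_t)(3*i - 20); }

        int32_t ref = 0;                        // scalar reference
        for (int i = 0; i < 16; ++i) ref += (int32_t) a[i] * (int32_t) b[i];

        const int32x4_t acc = ggml_vdotq_s32(vdupq_n_s32(0), vld1q_s8(a), vld1q_s8(b));
        const int32_t   got = vaddvq_s32(acc); // horizontal reduction, as in the kernels

        printf("%s (ref=%d, got=%d)\n", ref == got ? "OK" : "MISMATCH", ref, got);
        return ref != got;
    }

Compiling once with a dot-product-capable target (e.g. -march=armv8.2-a+dotprod) and once with plain -march=armv8-a should exercise the SDOT path and the vmull_s8 fallback respectively.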