@@ -264,9 +268,13 @@
import { createApp, defineComponent, shallowRef, computed, h } from './deps_vue.esm-browser.js';
import { llama } from './completion.js';
+ // utility functions
const isString = (x) => !!x.toLowerCase;
const isNumeric = (n) => !isString(n) && !isNaN(n);
+ const escapeAttr = (str) => str.replace(/>/g, '&gt;').replace(/"/g, '&quot;');
+ const copyStr = (str) => navigator.clipboard.writeText(str);
+ // constants
const BASE_URL = localStorage.getItem('base') // for debugging
|| (new URL('.', document.baseURI).href).toString(); // for production
const CONFIG_DEFAULT = {
@@ -295,7 +303,7 @@
custom: '', // custom json-stringified object
};
const CONFIG_INFO = {
- apiKey: '',
+ apiKey: 'Set the API Key if you are using --api-key option for the server.',
systemMessage: 'The starting message that defines how model should behave.',
temperature: 'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.',
dynatemp_range: 'Addon for the temperature sampler. The added value to the range of dynamic temperature, which adjusts probabilities by entropy of tokens.',
@@ -325,19 +333,28 @@
// markdown support
const VueMarkdown = defineComponent(
(props) => {
- const md = shallowRef(new markdownit(props.options ?? { breaks: true }));
- for (const plugin of props.plugins ?? []) {
- md.value.use(plugin);
- }
+ const md = shallowRef(new markdownit({ breaks: true }));
+ const origFenchRenderer = md.value.renderer.rules.fence;
+ md.value.renderer.rules.fence = (tokens, idx, ...args) => {
+ const content = tokens[idx].content;
+ const origRendered = origFenchRenderer(tokens, idx, ...args);
+ return `<div class="relative my-4">
+ <div class="text-right sticky top-4 mb-2 mr-2 h-0">
+ <button class="badge btn-mini" onclick="copyStr(${escapeAttr(JSON.stringify(content))})">📋 Copy</button>
+ </div>
+ ${origRendered}
+ </div>`;
+ };
+ window.copyStr = copyStr;
const content = computed(() => md.value.render(props.source));
return () => h("div", { innerHTML: content.value });
},
- { props: ["source", "options", "plugins"] }
+ { props: ["source"] }
);
// input field to be used by settings modal
- const SettingsModalNumericInput = defineComponent({
- template: document.getElementById('settings-modal-numeric-input').innerHTML,
+ const SettingsModalShortInput = defineComponent({
+ template: document.getElementById('settings-modal-short-input').innerHTML,
props: ['configKey', 'configDefault', 'configInfo', 'modelValue'],
});
@@ -390,7 +407,11 @@
if (!conv) return;
const msg = conv.messages.pop();
conv.lastModified = Date.now();
- localStorage.setItem(convId, JSON.stringify(conv));
+ if (conv.messages.length === 0) {
+ StorageUtils.remove(convId);
+ } else {
+ localStorage.setItem(convId, JSON.stringify(conv));
+ }
return msg;
},
@@ -431,7 +452,7 @@
const mainApp = createApp({
components: {
VueMarkdown,
- SettingsModalNumericInput,
+ SettingsModalShortInput,
},
data() {
return {
@@ -587,6 +608,7 @@
this.isGenerating = false;
this.stopGeneration = () => {};
this.fetchMessages();
+ chatScrollToBottom();
},
// message actions
@@ -600,7 +622,7 @@
this.generateMessage(currConvId);
},
copyMsg(msg) {
- navigator.clipboard.writeText(msg.content);
+ copyStr(msg.content);
},
editUserMsgAndRegenerate(msg) {
if (this.isGenerating) return;
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index c408eaaa5..6e81ebb76 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -103,6 +103,12 @@ struct server_task_result {
bool error;
};
+struct server_static_file {
+ const unsigned char * data;
+ unsigned int size;
+ const char * mime_type;
+};
+
struct slot_params {
bool stream = true;
bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
@@ -2260,6 +2266,16 @@ int main(int argc, char ** argv) {
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
LOG_INF("\n");
+ // static files
+ std::map<std::string, server_static_file> static_files = {
+ { "/", { index_html, index_html_len, "text/html; charset=utf-8" }},
+ { "/completion.js", { completion_js, completion_js_len, "text/javascript; charset=utf-8" }},
+ { "/deps_daisyui.min.css", { deps_daisyui_min_css, deps_daisyui_min_css_len, "text/css; charset=utf-8" }},
+ { "/deps_markdown-it.js", { deps_markdown_it_js, deps_markdown_it_js_len, "text/javascript; charset=utf-8" }},
+ { "/deps_tailwindcss.js", { deps_tailwindcss_js, deps_tailwindcss_js_len, "text/javascript; charset=utf-8" }},
+ { "/deps_vue.esm-browser.js", { deps_vue_esm_browser_js, deps_vue_esm_browser_js_len, "text/javascript; charset=utf-8" }},
+ };
+
std::unique_ptr<httplib::Server> svr;
#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
@@ -2340,7 +2356,7 @@ int main(int argc, char ** argv) {
// Middlewares
//
- auto middleware_validate_api_key = [&params, &res_error](const httplib::Request & req, httplib::Response & res) {
+ auto middleware_validate_api_key = [&params, &res_error, &static_files](const httplib::Request & req, httplib::Response & res) {
static const std::unordered_set<std::string> public_endpoints = {
"/health",
"/models",
@@ -2352,8 +2368,8 @@ int main(int argc, char ** argv) {
return true;
}
- // If path is public, skip validation
- if (public_endpoints.find(req.path) != public_endpoints.end()) {
+ // If path is public or is static file, skip validation
+ if (public_endpoints.find(req.path) != public_endpoints.end() || static_files.find(req.path) != static_files.end()) {
return true;
}
@@ -3097,13 +3113,6 @@ int main(int argc, char ** argv) {
res.status = 200; // HTTP OK
};
- auto handle_static_file = [](unsigned char * content, size_t len, const char * mime_type) {
- return [content, len, mime_type](const httplib::Request &, httplib::Response & res) {
- res.set_content(reinterpret_cast<const char *>(content), len, mime_type);
- return false;
- };
- };
-
//
// Router
//
@@ -3118,12 +3127,13 @@ int main(int argc, char ** argv) {
}
} else {
// using embedded static files
- svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
- svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
- svr->Get("/deps_daisyui.min.css", handle_static_file(deps_daisyui_min_css, deps_daisyui_min_css_len, "text/css; charset=utf-8"));
- svr->Get("/deps_markdown-it.js", handle_static_file(deps_markdown_it_js, deps_markdown_it_js_len, "text/javascript; charset=utf-8"));
- svr->Get("/deps_tailwindcss.js", handle_static_file(deps_tailwindcss_js, deps_tailwindcss_js_len, "text/javascript; charset=utf-8"));
- svr->Get("/deps_vue.esm-browser.js", handle_static_file(deps_vue_esm_browser_js, deps_vue_esm_browser_js_len, "text/javascript; charset=utf-8"));
+ for (const auto & it : static_files) {
+ const server_static_file & static_file = it.second;
+ svr->Get(it.first.c_str(), [&static_file](const httplib::Request &, httplib::Response & res) {
+ res.set_content(reinterpret_cast<const char *>(static_file.data), static_file.size, static_file.mime_type);
+ return false;
+ });
+ }
}
// register API routes
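
Aside: the pattern above (a path → {data, size, mime} map plus one registration loop) is easy to reproduce in isolation. The sketch below assumes only cpp-httplib; the asset bytes and the `static_file`/`files` names are placeholders, not taken from this patch.

```cpp
#include <map>
#include <string>
#include "httplib.h"

struct static_file {
    const unsigned char * data;
    unsigned int          size;
    const char *          mime_type;
};

int main() {
    static const unsigned char hello_html[] = "<h1>hello</h1>"; // placeholder asset

    std::map<std::string, static_file> files = {
        { "/", { hello_html, sizeof(hello_html) - 1, "text/html; charset=utf-8" } },
    };

    httplib::Server svr;
    for (const auto & it : files) {
        const static_file f = it.second; // small POD, copied into the handler
        svr.Get(it.first.c_str(), [f](const httplib::Request &, httplib::Response & res) {
            res.set_content(reinterpret_cast<const char *>(f.data), f.size, f.mime_type);
        });
    }
    svr.listen("127.0.0.1", 8080);
}
```

The patch itself captures a reference into the map instead of copying, which is equally valid because `static_files` outlives the server loop.
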
diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h
index 4da62cb2b..7571ef979 100644
--- a/ggml/include/ggml-cpu.h
+++ b/ggml/include/ggml-cpu.h
@@ -169,6 +169,9 @@ extern "C" {
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif
+ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
+ GGML_BACKEND_API bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft);
+
#ifdef __cplusplus
}
#endif
diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c
index 688b17b45..013912051 100644
--- a/ggml/src/ggml-aarch64.c
+++ b/ggml/src/ggml-aarch64.c
@@ -8,19 +8,42 @@
#define UNUSED GGML_UNUSED
-static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
+static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
block_q4_0x4 out;
for (int i = 0; i < 4; i++) {
out.d[i] = in[i].d;
}
- for (int i = 0; i < QK4_0 * 2; i++) {
- int src_offset = (i / (4 * blck_size_interleave)) * blck_size_interleave;
- int src_id = (i % (4 * blck_size_interleave)) / blck_size_interleave;
- src_offset += (i % blck_size_interleave);
+ const int end = QK4_0 * 2 / blck_size_interleave;
- out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
+ if (blck_size_interleave == 8) {
+ const uint64_t xor_mask = 0x8888888888888888ULL;
+ for (int i = 0; i < end; ++i) {
+ int src_id = i % 4;
+ int src_offset = (i / 4) * blck_size_interleave;
+ int dst_offset = i * blck_size_interleave;
+
+ uint64_t elems;
+ // Using memcpy to avoid unaligned memory accesses
+ memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
+ elems ^= xor_mask;
+ memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
+ }
+ } else if (blck_size_interleave == 4) {
+ const uint32_t xor_mask = 0x88888888;
+ for (int i = 0; i < end; ++i) {
+ int src_id = i % 4;
+ int src_offset = (i / 4) * blck_size_interleave;
+ int dst_offset = i * blck_size_interleave;
+
+ uint32_t elems;
+ memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
+ elems ^= xor_mask;
+ memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
+ }
+ } else {
+ GGML_ASSERT(false);
}
return out;
@@ -30,19 +53,25 @@ static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_in
// returns an interleaved block_q4_0x8
// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
-static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
+static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
block_q4_0x8 out;
for (int i = 0; i < 8; i++) {
out.d[i] = in[i].d;
}
- for (int i = 0; i < QK4_0 * 4; i++) {
- int src_offset = (i / (8 * blck_size_interleave)) * blck_size_interleave;
- int src_id = (i % (8 * blck_size_interleave)) / blck_size_interleave;
- src_offset += (i % blck_size_interleave);
+ const int end = QK4_0 * 4 / blck_size_interleave;
+ const uint64_t xor_mask = 0x8888888888888888ULL;
- out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
+ for (int i = 0; i < end; ++i) {
+ int src_id = i % 8;
+ int src_offset = (i / 8) * blck_size_interleave;
+ int dst_offset = i * blck_size_interleave;
+
+ uint64_t elems;
+ memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
+ elems ^= xor_mask;
+ memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
}
return out;
@@ -71,11 +100,11 @@ static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict ds
}
if (nrows_interleaved == 8) {
- *(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blck_size_interleave, 0x88);
+ *(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blck_size_interleave);
out_ptr = (block_q4_0x8 *) out_ptr + 1;
}
else if (nrows_interleaved == 4) {
- *(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave, 0x88);
+ *(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave);
out_ptr = (block_q4_0x4 *) out_ptr + 1;
}
}
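
The rewrite replaces per-byte index arithmetic with word-wise memcpy/XOR steps. For `blck_size_interleave == 8`, output word `i` takes bytes `(i/4)*8 .. (i/4)*8+7` of row `i % 4`, which is exactly what the removed loop produced byte by byte, and `0x88` repeated eight times is the 64-bit mask. A standalone check of that equivalence (a sketch, not part of ggml; dummy data, 4 rows, QK4_0 = 32):

```cpp
#include <assert.h>
#include <stdint.h>
#include <string.h>

enum { QK4_0 = 32, NBYTES = QK4_0 / 2, NROWS = 4, IL = 8 };

int main(void) {
    uint8_t in[NROWS][NBYTES], ref[QK4_0 * 2], out[QK4_0 * 2];
    for (int r = 0; r < NROWS; r++)
        for (int j = 0; j < NBYTES; j++)
            in[r][j] = (uint8_t)(r * 16 + j);

    // old byte-wise mapping (the removed loop), xor mask 0x88 per byte
    for (int i = 0; i < QK4_0 * 2; i++) {
        int src_offset = (i / (NROWS * IL)) * IL + (i % IL);
        int src_id     = (i % (NROWS * IL)) / IL;
        ref[i] = in[src_id][src_offset] ^ 0x88;
    }

    // new word-wise mapping: one 8-byte memcpy + 64-bit xor per step
    const uint64_t xor_mask = 0x8888888888888888ULL;
    for (int i = 0; i < QK4_0 * 2 / IL; i++) {
        uint64_t elems;
        memcpy(&elems, &in[i % NROWS][(i / NROWS) * IL], sizeof(elems));
        elems ^= xor_mask;
        memcpy(&out[i * IL], &elems, sizeof(elems));
    }

    assert(memcmp(ref, out, sizeof(ref)) == 0); // both layouts agree
    return 0;
}
```
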
diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
index 0ad9fe40a..96a16dfba 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
@@ -3385,3 +3385,176 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
}
}
}
+
+// FIXME: this code is duplicated from ggml-aarch64.c
+static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
+ block_q4_0x4 out;
+
+ for (int i = 0; i < 4; i++) {
+ out.d[i] = in[i].d;
+ }
+
+ const int end = QK4_0 * 2 / blck_size_interleave;
+
+ if (blck_size_interleave == 8) {
+ const uint64_t xor_mask = 0x8888888888888888ULL;
+ for (int i = 0; i < end; ++i) {
+ int src_id = i % 4;
+ int src_offset = (i / 4) * blck_size_interleave;
+ int dst_offset = i * blck_size_interleave;
+
+ uint64_t elems;
+ // Using memcpy to avoid unaligned memory accesses
+ memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
+ elems ^= xor_mask;
+ memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
+ }
+ } else if (blck_size_interleave == 4) {
+ const uint32_t xor_mask = 0x88888888;
+ for (int i = 0; i < end; ++i) {
+ int src_id = i % 4;
+ int src_offset = (i / 4) * blck_size_interleave;
+ int dst_offset = i * blck_size_interleave;
+
+ uint32_t elems;
+ memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
+ elems ^= xor_mask;
+ memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
+ }
+ } else {
+ GGML_ASSERT(false);
+ }
+
+ return out;
+}
+
+// interleave 8 block_q4_0s in blocks of blck_size_interleave
+// returns an interleaved block_q4_0x8
+// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
+// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
+static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
+ block_q4_0x8 out;
+
+ for (int i = 0; i < 8; i++) {
+ out.d[i] = in[i].d;
+ }
+
+ const int end = QK4_0 * 4 / blck_size_interleave;
+ const uint64_t xor_mask = 0x8888888888888888ULL;
+
+ for (int i = 0; i < end; ++i) {
+ int src_id = i % 8;
+ int src_offset = (i / 8) * blck_size_interleave;
+ int dst_offset = i * blck_size_interleave;
+
+ uint64_t elems;
+ memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
+ elems ^= xor_mask;
+ memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
+ }
+
+ return out;
+}
+
+static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * restrict data, size_t data_size) {
+ GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
+ GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
+
+ block_q4_0x4 * dst = (block_q4_0x4 *)t->data;
+ const block_q4_0 * src = (const block_q4_0 *)data;
+ block_q4_0 dst_tmp[4];
+ int nrow = t->ne[1]; // Number of rows
+ int nrows_interleaved = 4;
+ int nblocks = t->ne[0] / QK4_0;
+
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
+
+ if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
+ return -1;
+ }
+
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
+ for (int64_t x = 0; x < nblocks; x++) {
+ for (int i = 0; i < nrows_interleaved; i++) {
+ dst_tmp[i] = src[x + i * nblocks];
+ }
+ *dst++ = make_block_q4_0x4(dst_tmp, interleave_block);
+ }
+ src += nrows_interleaved * nblocks;
+ }
+ return 0;
+
+ GGML_UNUSED(data_size);
+}
+
+static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block, const void * restrict data, size_t data_size) {
+ GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
+ GGML_ASSERT(interleave_block == 8);
+
+ block_q4_0x8 * dst = (block_q4_0x8*)t->data;
+ const block_q4_0 * src = (const block_q4_0*) data;
+ block_q4_0 dst_tmp[8];
+ int nrow = t->ne[1]; // Number of rows
+ int nrows_interleaved = 8;
+ int nblocks = t->ne[0] / QK4_0;
+
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
+
+ if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
+ return -1;
+ }
+
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
+ for (int64_t x = 0; x < nblocks; x++) {
+ for (int i = 0; i < nrows_interleaved; i++ ) {
+ dst_tmp[i] = src[x + i * nblocks];
+ }
+ *dst++ = make_block_q4_0x8(dst_tmp, interleave_block);
+ }
+ src += nrows_interleaved * nblocks;
+ }
+ return 0;
+
+ GGML_UNUSED(data_size);
+}
+
+// Prepare for optimized kernels if applicable
+void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * restrict data, size_t data_size) {
+ if (cur->type == repack_type) {
+ memcpy(cur->data, data, data_size);
+ return;
+ }
+
+ GGML_ASSERT(cur->type == GGML_TYPE_Q4_0);
+
+ switch (repack_type) {
+ case GGML_TYPE_Q4_0_8_8:
+ repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size);
+ break;
+ case GGML_TYPE_Q4_0_4_8:
+ repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size);
+ break;
+ case GGML_TYPE_Q4_0_4_4:
+ repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size);
+ break;
+ default:
+ GGML_ABORT("Unsupported type");
+ }
+}
+
+enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur) {
+ if (cur->type == GGML_TYPE_Q4_0) {
+ // TODO: enable for AVX2 - currently disabled due to bad gemv performance
+ if (/* ggml_cpu_has_avx2() || */ (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
+ return GGML_TYPE_Q4_0_8_8;
+ }
+ if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
+ return GGML_TYPE_Q4_0_4_8;
+ }
+ if (ggml_cpu_has_neon()) {
+ return GGML_TYPE_Q4_0_4_4;
+ }
+ }
+
+ return cur->type;
+}
diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.h b/ggml/src/ggml-cpu/ggml-cpu-aarch64.h
index 203802f07..53b30c1dd 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.h
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.h
@@ -21,6 +21,9 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * data, size_t data_size);
+enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur);
+
#ifdef __cplusplus
}
#endif
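
These two declarations are the whole public surface of the repack path. A minimal sketch of how they compose, mirroring what the CPU_AARCH64 buffer type does in ggml-cpu.cpp further below (the wrapper name is illustrative, not part of ggml):

```cpp
#include "ggml.h"
#include "ggml-cpu-aarch64.h"

// Illustrative helper: pick the layout once, then repack the Q4_0 data on upload.
static void upload_q4_0_weights(struct ggml_tensor * t, const void * data, size_t size) {
    enum ggml_type repack_type = ggml_aarch64_get_optimal_repack_type(t); // e.g. GGML_TYPE_Q4_0_4_8 on NEON + i8mm
    ggml_aarch64_repack_tensor(t, repack_type, data, size);               // plain memcpy if no better layout exists
}
```
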
diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.c b/ggml/src/ggml-cpu/ggml-cpu-quants.c
index ce648c851..7d9747f37 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-quants.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-quants.c
@@ -151,6 +151,28 @@ static inline __m128i packNibbles( __m256i bytes )
#endif
}
#elif defined(__AVX__)
+static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
+{
+ // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
+ const __m128i lowByte = _mm_set1_epi16( 0xFF );
+ __m128i high = _mm_andnot_si128( lowByte, bytes1 );
+ __m128i low = _mm_and_si128( lowByte, bytes1 );
+ high = _mm_srli_epi16( high, 4 );
+ bytes1 = _mm_or_si128( low, high );
+ high = _mm_andnot_si128( lowByte, bytes2 );
+ low = _mm_and_si128( lowByte, bytes2 );
+ high = _mm_srli_epi16( high, 4 );
+ bytes2 = _mm_or_si128( low, high );
+
+ return _mm_packus_epi16( bytes1, bytes2);
+}
+
+static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
+ const __m128i ax = _mm_sign_epi8(x, x);
+ const __m128i sy = _mm_sign_epi8(y, x);
+ return _mm_maddubs_epi16(ax, sy);
+}
+
// spread 32 bits to 32 bytes { 0x00, 0xFF }
static inline __m256i bytes_from_bits_32(const uint8_t * x) {
uint32_t x32;
@@ -218,26 +240,29 @@ static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
return sum_i16_pairs_float(doth, dotl);
}
-static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
-{
- // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
- const __m128i lowByte = _mm_set1_epi16( 0xFF );
- __m128i high = _mm_andnot_si128( lowByte, bytes1 );
- __m128i low = _mm_and_si128( lowByte, bytes1 );
- high = _mm_srli_epi16( high, 4 );
- bytes1 = _mm_or_si128( low, high );
- high = _mm_andnot_si128( lowByte, bytes2 );
- low = _mm_and_si128( lowByte, bytes2 );
- high = _mm_srli_epi16( high, 4 );
- bytes2 = _mm_or_si128( low, high );
+// larger version of mul_sum_i8_pairs_float where x and y are each represented by four 128-bit vectors
+static inline __m256 mul_sum_i8_quad_float(const __m128i x_1_0, const __m128i x_1_1, const __m128i x_2_0, const __m128i x_2_1,
+ const __m128i y_1_0, const __m128i y_1_1, const __m128i y_2_0, const __m128i y_2_1) {
+ const __m128i mone = _mm_set1_epi16(1);
- return _mm_packus_epi16( bytes1, bytes2);
+ const __m128i p16_1_0 = mul_add_epi8_sse(x_1_0, y_1_0);
+ const __m128i p16_1_1 = mul_add_epi8_sse(x_1_1, y_1_1);
+ const __m128i p16_2_0 = mul_add_epi8_sse(x_2_0, y_2_0);
+ const __m128i p16_2_1 = mul_add_epi8_sse(x_2_1, y_2_1);
+ const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
+ const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
+ const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
+ const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
+ const __m128i p_1 = _mm_add_epi32(p_1_0, p_1_1);
+ const __m128i p_2 = _mm_add_epi32(p_2_0, p_2_1);
+ return _mm256_cvtepi32_ps(MM256_SET_M128I(p_2, p_1));
}
-static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
- const __m128i ax = _mm_sign_epi8(x, x);
- const __m128i sy = _mm_sign_epi8(y, x);
- return _mm_maddubs_epi16(ax, sy);
+// quad fp16 delta calculation
+static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const float x1, const float y1) {
+ // GGML_FP16_TO_FP32 is faster than Intel F16C
+ return _mm256_set_m128(_mm_set1_ps(GGML_FP16_TO_FP32(x1) * GGML_FP16_TO_FP32(y1)),
+ _mm_set1_ps(GGML_FP16_TO_FP32(x0) * GGML_FP16_TO_FP32(y0)));
}
#endif
#elif defined(__SSSE3__)
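
In scalar terms the new helpers compute ordinary int8 dot products: `_mm_sign_epi8` moves the sign of x onto y so that `_mm_maddubs_epi16` (unsigned × signed) produces x·y pairs, and the later `_mm_madd_epi16` with ones folds those pairs into int32 lanes. A reference statement of that contract (a sketch; it ignores the int16 saturation corner case, which the 4-bit/8-bit operands used here cannot reach):

```cpp
#include <stdint.h>

// Reference dot product: lanes 0-3 of mul_sum_i8_quad_float sum to dot_i8 over the
// 32 bytes of block ib, lanes 4-7 to block ib+1. quad_fp16_delta_float broadcasts
// d(x0)*d(y0) into lanes 0-3 and d(x1)*d(y1) into lanes 4-7, so a single multiply-add
// scales both blocks before the final hsum_float_8.
static int32_t dot_i8(const int8_t * x, const int8_t * y, int n) {
    int32_t acc = 0;
    for (int i = 0; i < n; i++) {
        acc += (int32_t) x[i] * (int32_t) y[i];
    }
    return acc;
}
```
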
@@ -2005,10 +2030,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
sumf = hsum_float_8(acc);
#elif defined(__AVX__)
- const __m128i mone = _mm_set1_epi16(1);
-
- __m256 accum1 = _mm256_setzero_ps();
- __m256 accum2 = _mm256_setzero_ps();
+ __m256 accum = _mm256_setzero_ps();
for (; ib + 1 < nb; ib += 2) {
const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
@@ -2021,21 +2043,20 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
const __m128i q4b_1_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_1, 4)), _mm_set1_epi8(8));
const __m128i q4b_2_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_2), _mm_set1_epi8(8));
const __m128i q4b_2_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_2, 4)), _mm_set1_epi8(8));
+
const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
- const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
- const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
- const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
- const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
- accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
- _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
- accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
- _mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
+ const __m128i p_1 = _mm_add_epi16(p16_1_0, p16_1_1);
+ const __m128i p_2 = _mm_add_epi16(p16_2_0, p16_2_1);
+ const __m256 p = sum_i16_pairs_float(p_2, p_1);
+
+ const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
+ accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
}
- sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
+ sumf = hsum_float_8(accum);
#elif defined(__SSSE3__)
// set constants
const __m128i lowMask = _mm_set1_epi8(0xF);
@@ -3536,7 +3557,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
}
sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
-#elif defined(__AVX2__) || defined(__AVX__)
+#elif defined(__AVX2__)
// Initialize accumulator with zeros
__m256 acc = _mm256_setzero_ps();
@@ -3550,14 +3571,29 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
const __m256 q = mul_sum_i8_pairs_float(qx, qy);
// Multiply q with scale and accumulate
-#if defined(__AVX2__)
acc = _mm256_fmadd_ps( d, q, acc );
-#else
- acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc );
-#endif
}
sumf = hsum_float_8(acc);
+#elif defined(__AVX__)
+ __m256 accum = _mm256_setzero_ps();
+
+ for (; ib + 1 < nb; ib += 2) {
+ const __m128i qx_1_0 = _mm_loadu_si128((const __m128i *)x[ib].qs);
+ const __m128i qx_1_1 = _mm_loadu_si128((const __m128i *)x[ib].qs + 1);
+ const __m128i qx_2_0 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
+ const __m128i qx_2_1 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs + 1);
+ const __m128i qy_1_0 = _mm_loadu_si128((const __m128i *)y[ib].qs);
+ const __m128i qy_1_1 = _mm_loadu_si128((const __m128i *)y[ib].qs + 1);
+ const __m128i qy_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
+ const __m128i qy_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
+
+ const __m256 p = mul_sum_i8_quad_float(qx_1_0, qx_1_1, qx_2_0, qx_2_1, qy_1_0, qy_1_1, qy_2_0, qy_2_1);
+ const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
+ accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
+ }
+
+ sumf = hsum_float_8(accum);
#elif defined(__riscv_v_intrinsic)
size_t vl = __riscv_vsetvl_e8m1(qk);
@@ -10323,10 +10359,8 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
#elif defined __AVX__
const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
const __m128i m4b = _mm_set1_epi8(0x0f);
- const __m128i mone = _mm_set1_epi16(1);
- __m256 accum1 = _mm256_setzero_ps();
- __m256 accum2 = _mm256_setzero_ps();
+ __m256 accum = _mm256_setzero_ps();
for (; ib + 1 < nb; ib += 2) {
const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
@@ -10339,21 +10373,13 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
- const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
- const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
- const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
- const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
- const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
- const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
- const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
- const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
- accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
- _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
- accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
- _mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
+
+ const __m256 p = mul_sum_i8_quad_float(q4b_1_0, q4b_1_1, q4b_2_0, q4b_2_1, q8b_1_0, q8b_1_1, q8b_2_0, q8b_2_1);
+ const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
+ accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
}
- sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
+ sumf = hsum_float_8(accum);
#elif defined(__POWER9_VECTOR__)
const vector signed char lowMask = vec_splats((signed char)0xF);
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 76a00e39a..fb56cc37f 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1472,8 +1472,12 @@ static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t
sumf += (ggml_float)_mm512_reduce_add_ps(c2);
#undef LOAD
-#elif defined(__AVX2__)
+#elif defined(__AVX2__) || defined(__AVX__)
+#if defined(__AVX2__)
#define LOAD(p) _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16))
+#else
+#define LOAD(p) _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16)), (_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_bsrli_si128(_mm_loadu_si128((const __m128i *)(p)), 8)), 16)), 1))
+#endif
__m256 c1 = _mm256_setzero_ps();
__m256 c2 = _mm256_setzero_ps();
__m256 c3 = _mm256_setzero_ps();
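
For reference, LOAD only widens bf16 to f32 by bit manipulation; no numeric conversion is involved. A scalar sketch of the same operation:

```cpp
#include <stdint.h>
#include <string.h>

// A bf16 value is the top 16 bits of an IEEE-754 float, so widening is a
// zero-extend plus a 16-bit left shift (the low mantissa bits become zero).
static float bf16_to_f32(uint16_t h) {
    uint32_t bits = (uint32_t) h << 16;
    float f;
    memcpy(&f, &bits, sizeof(f)); // bit-cast
    return f;
}
```

The AVX2 path does this for 8 lanes with `_mm256_cvtepu16_epi32` plus a shift; the plain-AVX fallback widens the low and the byte-shifted high half separately with `_mm_cvtepu16_epi32` and stitches them together with `_mm256_insertf128_si256`.
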
@@ -7358,6 +7362,7 @@ static void ggml_compute_forward_group_norm(
static void ggml_compute_forward_mul_mat_one_chunk(
const struct ggml_compute_params * params,
struct ggml_tensor * dst,
+ const enum ggml_type type,
const int64_t num_rows_per_vec_dot,
const int64_t ir0_start,
const int64_t ir0_end,
@@ -7369,8 +7374,6 @@ static void ggml_compute_forward_mul_mat_one_chunk(
GGML_TENSOR_BINARY_OP_LOCALS
- const enum ggml_type type = src0->type;
-
const bool src1_cont = ggml_is_contiguous(src1);
ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
@@ -7458,7 +7461,11 @@ static void ggml_compute_forward_mul_mat(
const int ith = params->ith;
const int nth = params->nth;
- const enum ggml_type type = src0->type;
+ enum ggml_type type = src0->type;
+
+ if (src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
+ type = (enum ggml_type)(intptr_t)src0->extra;
+ }
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
@@ -7505,15 +7512,15 @@ static void ggml_compute_forward_mul_mat(
if (src1_cont) {
for (int64_t i13 = 0; i13 < ne13; i13++)
for (int64_t i12 = 0; i12 < ne12; i12++)
- if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+ if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type),
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
- nb01/ggml_type_size(src0->type),
+ nb01/ggml_type_size(type),
(const char *)src1->data + i12*nb12 + i13*nb13,
nb11/ggml_type_size(src1->type),
(char *)dst->data + i12*nb2 + i13*nb3,
nb1/ggml_type_size(dst->type),
ith, nth,
- src0->type,
+ type,
src1->type,
dst->type))
goto UseGgmlGemm1;
@@ -7566,15 +7573,15 @@ UseGgmlGemm1:;
for (int64_t i13 = 0; i13 < ne13; i13++)
for (int64_t i12 = 0; i12 < ne12; i12++)
- if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+ if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type),
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
- nb01/ggml_type_size(src0->type),
+ nb01/ggml_type_size(type),
(const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
row_size/ggml_type_size(vec_dot_type),
(char *)dst->data + i12*nb2 + i13*nb3,
nb1/ggml_type_size(dst->type),
ith, nth,
- src0->type,
+ type,
vec_dot_type,
dst->type))
goto UseGgmlGemm2;
@@ -7659,7 +7666,7 @@ UseGgmlGemm2:;
const int64_t ir1_start = dr1 * ith1;
const int64_t ir1_end = MIN(ir1_start + dr1, nr1);
- ggml_compute_forward_mul_mat_one_chunk(params, dst, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
+ ggml_compute_forward_mul_mat_one_chunk(params, dst, type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
if (nth >= nchunk0 * nchunk1) {
break;
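
The type override relies on a small convention: the CPU_AARCH64 buffer type (added in ggml-cpu.cpp below) stashes the repacked ggml_type in `tensor->extra` as an integer squeezed into the pointer. A sketch of both sides of that convention, assuming the internal headers that expose `tensor->buffer->buft`:

```cpp
#include <stdint.h>
#include "ggml.h"
#include "ggml-backend-impl.h" // struct ggml_backend_buffer, so buffer->buft is visible
#include "ggml-cpu.h"          // ggml_backend_cpu_buft_is_aarch64()

// what the aarch64 buffer's init_tensor hook stores ...
static void set_repack_type(struct ggml_tensor * t, enum ggml_type repacked) {
    t->extra = (void *)(intptr_t) repacked;
}

// ... and what ggml_compute_forward_mul_mat reads back
static enum ggml_type effective_type(const struct ggml_tensor * t) {
    if (t->buffer && ggml_backend_cpu_buft_is_aarch64(t->buffer->buft)) {
        return (enum ggml_type)(intptr_t) t->extra;
    }
    return t->type;
}
```
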
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
index c7216117b..573b7c5b9 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -1,6 +1,7 @@
#include "ggml-backend.h"
#include "ggml-backend-impl.h"
#include "ggml-cpu.h"
+#include "ggml-cpu-aarch64.h"
#include "ggml-impl.h"
#include
#include
@@ -69,15 +70,84 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
}
#endif
-static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
- static ggml_backend_buffer_type_t bufts[] = {
-#ifdef GGML_USE_CPU_HBM
- ggml_backend_cpu_hbm_buffer_type(),
-#endif
- NULL
+// buffer type AARCH64
+
+static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+ tensor->extra = (void *)ggml_aarch64_get_optimal_repack_type(tensor); // NOLINT
+
+ GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+ GGML_ASSERT(offset == 0);
+ GGML_ASSERT(size == ggml_nbytes(tensor));
+
+ enum ggml_type repack_type = (enum ggml_type)(intptr_t)tensor->extra;
+
+ ggml_aarch64_repack_tensor(tensor, repack_type, data, size);
+
+ GGML_UNUSED(buffer);
+}
+
+static const char * ggml_backend_cpu_aarch64_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+ return "CPU_AARCH64";
+
+ GGML_UNUSED(buft);
+}
+
+static ggml_backend_buffer_t ggml_backend_cpu_aarch64_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+ auto * buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
+
+ if (buffer == NULL) {
+ return NULL;
+ }
+
+ buffer->buft = buft;
+ buffer->iface.init_tensor = ggml_backend_cpu_aarch64_buffer_init_tensor;
+ buffer->iface.set_tensor = ggml_backend_cpu_aarch64_buffer_set_tensor;
+
+ return buffer;
+}
+
+ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void) {
+ static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_aarch64 = {
+ /* .iface = */ {
+ /* .get_name = */ ggml_backend_cpu_aarch64_buffer_type_get_name,
+ /* .alloc_buffer = */ ggml_backend_cpu_aarch64_buffer_type_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
+ /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+ /* .is_host = */ NULL,
+ },
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+ /* .context = */ NULL,
};
- return bufts;
+ return &ggml_backend_cpu_buffer_type_aarch64;
+}
+
+bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft) {
+ return buft == ggml_backend_cpu_aarch64_buffer_type();
+}
+
+static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
+ static std::vector<ggml_backend_buffer_type_t> bufts = []() {
+ std::vector<ggml_backend_buffer_type_t> bufts;
+
+#ifdef GGML_USE_CPU_HBM
+ bufts.push_back(ggml_backend_cpu_hbm_buffer_type());
+#endif
+
+#ifdef GGML_USE_CPU_AARCH64
+ bufts.push_back(ggml_backend_cpu_aarch64_buffer_type());
+#endif
+
+ bufts.push_back(NULL);
+
+ return bufts;
+ }();
+
+ return bufts.data();
GGML_UNUSED(device);
}
@@ -383,6 +453,21 @@ static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_b
}
static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+ const struct ggml_tensor * src0 = op->src[0];
+ const struct ggml_tensor * src1 = op->src[1];
+
+ if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
+ if (op->op != GGML_OP_MUL_MAT || src0->type != GGML_TYPE_Q4_0 || ggml_aarch64_get_optimal_repack_type(src0) == GGML_TYPE_Q4_0) {
+ return false;
+ }
+ }
+
+ for (int i = 1; i < GGML_MAX_SRC; i++) {
+ if (op->src[i] && op->src[i]->buffer && ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) {
+ return false;
+ }
+ }
+
switch (op->op) {
case GGML_OP_CPY:
return
@@ -391,13 +476,13 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
op->type != GGML_TYPE_IQ1_S &&
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
case GGML_OP_MUL_MAT:
- return op->src[1]->type == GGML_TYPE_F32;// FIXME || op->src[1]->type == ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
+ return src1->type == GGML_TYPE_F32 || src1->type == ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
case GGML_OP_ROPE_BACK:
return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
case GGML_OP_IM2COL_BACK:
- return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
+ return src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
case GGML_OP_OUT_PROD:
- return (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == GGML_TYPE_F32;
+ return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) && src1->type == GGML_TYPE_F32;
default:
return true;
}
@@ -406,7 +491,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
}
static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
- return ggml_backend_buft_is_host(buft);
+ return ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft);
GGML_UNUSED(dev);
}
@@ -566,6 +651,9 @@ static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
};
ggml_backend_reg_t ggml_backend_cpu_reg(void) {
+ // init CPU feature detection
+ ggml_cpu_init();
+
static struct ggml_backend_reg ggml_backend_cpu_reg = {
/* .iface = */ ggml_backend_cpu_reg_i,
/* .context = */ NULL,
diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp
index fe4a8f744..c2f28bb49 100644
--- a/ggml/src/ggml-sycl/dpct/helper.hpp
+++ b/ggml/src/ggml-sycl/dpct/helper.hpp
@@ -15,6 +15,7 @@
#include
#include
+#include
#include
#include