Merge commit 'ad3a0505e3' into concedo_experimental

# Conflicts: # .github/workflows/build.yml # .github/workflows/close-issue.yml # .github/workflows/code-coverage.yml # .github/workflows/docker.yml # .github/workflows/editorconfig.yml # .github/workflows/nix-ci-aarch64.yml # .github/workflows/nix-ci.yml # .github/workflows/python-check-requirements.yml # .github/workflows/python-lint.yml # .github/workflows/server.yml # .github/workflows/zig-build.yml # .gitignore # CMakeLists.txt # Makefile # README-sycl.md # README.md # build.zig # common/CMakeLists.txt # llama.cpp # tests/CMakeLists.txt # tests/test-backend-ops.cpp
2025-09-15 03:19:41 +00:00 · 2024-04-06 18:32:57 +08:00 · 2024-04-06 18:32:57 +08:00 · 9c0fbf9f73
commit 9c0fbf9f73
parent c348223dff ad3a0505e3
67 changed files with 10861 additions and 4661 deletions
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@ -2,12 +2,16 @@ set(TARGET server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
 option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
+add_executable(${TARGET}
+    server.cpp
+    utils.hpp
+    httplib.h
+)
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
-target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common json-schema-to-grammar ${CMAKE_THREAD_LIBS_INIT})
 if (LLAMA_SERVER_SSL)
    find_package(OpenSSL REQUIRED)
    target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -16,17 +16,20 @@ The project is under active development, and we are [looking for feedback and co

 **Command line options:**

- `--threads N`, `-t N`: Set the number of threads to use during generation.
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
+- `--threads N`, `-t N`: Set the number of threads to use during generation. Not used if model layers are offloaded to GPU. Because the server uses batching, this parameter is used only when a single token is to be processed on the CPU backend.
+- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. Not used if model layers are offloaded to GPU.
 - `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`)
 - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
+- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (default: unused).
+- `-hfr REPO, --hf-repo REPO`: Hugging Face model repository (default: unused).
+- `-hff FILE, --hf-file FILE`: Hugging Face model file (default: unused).
 - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
 - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
 - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
 - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
 - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`.
+- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048`.
+- `-ub N`, `--ubatch-size N`: physical maximum batch size. Default: `512`.
 - `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
 - `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
 - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
@ -57,7 +60,7 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
 - `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
 - `--metrics`: enable prometheus `/metrics` compatible endpoint (default: disabled)
 - `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name (default: template taken from model's metadata). We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
- `--log-disable`: Output logs to stdout only, default: enabled.
+- `--log-disable`: Output logs to stdout only, not to `llama.log`. default: enabled.
 - `--log-format FORMAT`: Define the log output to FORMAT: json or text (default: json)

 **If compiled with `LLAMA_SERVER_SSL=ON`**
@ -260,7 +263,7 @@ node index.js

    `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.

-    `slot_id`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1)
+    `id_slot`: Assign the completion task to a specific slot. If it is -1, the task will be assigned to an idle slot (default: -1)

    `cache_prompt`: Re-use previously cached prompt from the last request if possible. This may prevent re-caching the prompt from scratch. (default: false)

@ -357,7 +360,7 @@ Notice that each `probs` is an array of length `n_probs`.
 - `default_generation_settings` - the default generation settings for the `/completion` endpoint, has the same fields as the `generation_settings` response object from the `/completion` endpoint.
 - `total_slots` - the total number of slots for process requests (defined by `--parallel` option)

- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint.
+- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming modes are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with the OpenAI API spec are being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.

    *Options:*

--- a/examples/server/chat.mjs
+++ b/examples/server/chat.mjs
@ -26,8 +26,9 @@ const propOrder = grammarJsonSchemaPropOrder

 let grammar = null
 if (grammarJsonSchemaFile) {
-    const schema = JSON.parse(readFileSync(grammarJsonSchemaFile, 'utf-8'))
-    const converter = new SchemaConverter(propOrder)
+    let schema = JSON.parse(readFileSync(grammarJsonSchemaFile, 'utf-8'))
+    const converter = new SchemaConverter({prop_order: propOrder, allow_fetch: true})
+    schema = await converter.resolveRefs(schema, grammarJsonSchemaFile)
    converter.visit(schema, '')
    grammar = converter.formatGrammar()
 }
--- a/examples/server/completion.js.hpp
+++ b/examples/server/completion.js.hpp
@ -483,4 +483,4 @@ unsigned char completion_js[] = {
  0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f,
  0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x7d, 0x0a
 };
-unsigned int completion_js_len = 5796;
+size_t completion_js_len = 5796;
--- a/examples/server/index.html.hpp
+++ b/examples/server/index.html.hpp
--- a/examples/server/index.js.hpp
+++ b/examples/server/index.js.hpp
--- a/examples/server/json-schema-to-grammar.mjs.hpp
+++ b/examples/server/json-schema-to-grammar.mjs.hpp
--- a/examples/server/json.hpp
+++ b/examples/server/json.hpp
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@ -630,14 +630,16 @@

      const grammarJsonSchemaPropOrder = signal('')
      const updateGrammarJsonSchemaPropOrder = (el) => grammarJsonSchemaPropOrder.value = el.target.value
-      const convertJSONSchemaGrammar = () => {
+      const convertJSONSchemaGrammar = async () => {
        try {
-          const schema = JSON.parse(params.value.grammar)
-          const converter = new SchemaConverter(
-            grammarJsonSchemaPropOrder.value
+          let schema = JSON.parse(params.value.grammar)
+          const converter = new SchemaConverter({
+            prop_order: grammarJsonSchemaPropOrder.value
              .split(',')
-              .reduce((acc, cur, i) => ({ ...acc, [cur.trim()]: i }), {})
-          )
+              .reduce((acc, cur, i) => ({ ...acc, [cur.trim()]: i }), {}),
+            allow_fetch: true,
+          })
+          schema = await converter.resolveRefs(schema, 'input')
          converter.visit(schema, '')
          params.value = {
            ...params.value,
--- a/examples/server/public/index.js
+++ b/examples/server/public/index.js
--- a/examples/server/public/json-schema-to-grammar.mjs
+++ b/examples/server/public/json-schema-to-grammar.mjs
@ -1,112 +1,538 @@
+// WARNING: This file was ported from json-schema-to-grammar.py, please fix bugs / add features there first.
 const SPACE_RULE = '" "?';

 const PRIMITIVE_RULES = {
  boolean: '("true" | "false") space',
  number: '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
  integer: '("-"? ([0-9] | [1-9] [0-9]*)) space',
+  value: 'object | array | string | number | boolean',
+  object: '"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space',
+  array: '"[" space ( value ("," space value)* )? "]" space',
+  uuid: '"\\"" ' + [8, 4, 4, 4, 12].map(n => [...new Array(n)].map(_ => '[0-9a-fA-F]').join('')).join(' "-" ') + ' "\\"" space',
  string: ` "\\"" (
        [^"\\\\] |
        "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
      )* "\\"" space`,
  null: '"null" space',
 };
+const OBJECT_RULE_NAMES = ['object', 'array', 'string', 'number', 'boolean', 'null', 'value'];
+
+// TODO: support "uri", "email" string formats
+const DATE_RULES = {
+    'date'   : '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )',
+    'time'   : '([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )',
+    'date-time': 'date "T" time',
+    'date-string': '"\\"" date "\\"" space',
+    'time-string': '"\\"" time "\\"" space',
+    'date-time-string': '"\\"" date-time "\\"" space',
+};
+
+const RESERVED_NAMES = {'root': true, ...PRIMITIVE_RULES, ...DATE_RULES};

 const INVALID_RULE_CHARS_RE = /[^\dA-Za-z-]+/g;
 const GRAMMAR_LITERAL_ESCAPE_RE = /[\n\r"]/g;
-const GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"'};
+const GRAMMAR_RANGE_LITERAL_ESCAPE_RE = /[\n\r"\]\-\\]/g;
+const GRAMMAR_LITERAL_ESCAPES = { '\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]' };
+
+const NON_LITERAL_SET = new Set('|.()[]{}*+?');
+const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('[]()|{}*+?');

 export class SchemaConverter {
-  constructor(propOrder) {
-    this._propOrder = propOrder || {};
-    this._rules = new Map();
-    this._rules.set('space', SPACE_RULE);
+  constructor(options) {
+    this._propOrder = options.prop_order || {};
+    this._allowFetch = options.allow_fetch || false;
+    this._dotall = options.dotall || false;
+    this._rules = {'space': SPACE_RULE};
+    this._refs = {};
+    this._refsBeingResolved = new Set();
  }

  _formatLiteral(literal) {
-    const escaped = JSON.stringify(literal).replace(
+    const escaped = literal.replace(
      GRAMMAR_LITERAL_ESCAPE_RE,
      m => GRAMMAR_LITERAL_ESCAPES[m]
    );
    return `"${escaped}"`;
  }

+  _formatRangeChar(literal) {
+    return JSON.stringify(literal).slice(1, -1).replace(
+      GRAMMAR_RANGE_LITERAL_ESCAPE_RE,
+      m => GRAMMAR_LITERAL_ESCAPES[m]
+    );
+  }
+
  _addRule(name, rule) {
    let escName = name.replace(INVALID_RULE_CHARS_RE, '-');
    let key = escName;

-    if (this._rules.has(escName)) {
-      if (this._rules.get(escName) === rule) {
+    if (escName in this._rules) {
+      if (this._rules[escName] === rule) {
        return key;
      }

      let i = 0;
-      while (this._rules.has(`${escName}${i}`)) {
+      while ((`${escName}${i}` in this._rules) && (this._rules[`${escName}${i}`] !== rule)) {
        i += 1;
      }
      key = `${escName}${i}`;
    }

-    this._rules.set(key, rule);
+    this._rules[key] = rule;
    return key;
  }

+  /**
+   * Walks `schema` recursively and resolves every `$ref` it finds, recording
+   * each resolved target in `this._refs` keyed by the full ref string.
+   *
+   * Supported refs:
+   *  - `https://...` : the document is fetched (only when `allow_fetch` was set),
+   *    resolved recursively and cached in `this._refs` under its base URL.
+   *  - `#/...`       : resolved against the `schema` passed to this call; the
+   *    node's `$ref` is rewritten in place to `${url}${ref}`.
+   * Any other ref form throws.
+   *
+   * @param schema the schema (object, array or primitive) to walk; mutated in place
+   * @param url    identifier prepended to local `#/...` refs
+   * @returns a promise for the result of visiting the top-level node
+   * @throws Error when fetching is disallowed, the ref form is unsupported,
+   *         or a path segment of the ref does not exist in the target
+   */
+  async resolveRefs(schema, url) {
+    const visit = async (n) => {
+      if (Array.isArray(n)) {
+        // Arrays: visit every element; the results are collected into a new array.
+        return Promise.all(n.map(visit));
+      } else if (typeof n === 'object' && n !== null) {
+        let ref = n.$ref;
+        let target;
+        // Only resolve refs that have not already been recorded in this._refs.
+        if (ref !== undefined && !this._refs[ref]) {
+          if (ref.startsWith('https://')) {
+            if (!this._allowFetch) {
+              throw new Error('Fetching remote schemas is not allowed (use --allow-fetch for force)');
+            }
+            // Loaded lazily so the dependency is only needed when a remote ref is met.
+            const fetch = (await import('node-fetch')).default;
+
+            const fragSplit = ref.split('#');
+            const baseUrl = fragSplit[0];
+
+            // Reuse the remote document if it was already fetched and resolved.
+            target = this._refs[baseUrl];
+            if (!target) {
+              target = await this.resolveRefs(await fetch(ref).then(res => res.json()), baseUrl);
+              this._refs[baseUrl] = target;
+            }
+
+            // No fragment (or an empty one): the ref designates the whole document.
+            if (fragSplit.length === 1 || fragSplit[fragSplit.length - 1] === '') {
+              return target;
+            }
+          } else if (ref.startsWith('#/')) {
+            // Local ref: look it up in the schema given to this resolveRefs call and
+            // make the ref absolute so it is unique as a key of this._refs.
+            target = schema;
+            ref = `${url}${ref}`;
+            n.$ref = ref;
+          } else {
+            throw new Error(`Unsupported ref ${ref}`);
+          }
+
+          // Follow the path segments after '#/' down into the target document.
+          const selectors = ref.split('#')[1].split('/').slice(1);
+          for (const sel of selectors) {
+            if (!target || !(sel in target)) {
+              throw new Error(`Error resolving ref ${ref}: ${sel} not in ${JSON.stringify(target)}`);
+            }
+            target = target[sel];
+          }
+
+          this._refs[ref] = target;
+        } else {
+          // No $ref here (or it is already recorded): recurse into the property values.
+          await Promise.all(Object.values(n).map(visit));
+        }
+      }
+
+      return n;
+    };
+
+    return visit(schema);
+  }
+
+  /**
+   * Builds the body of an alternation rule: each alternative schema is visited
+   * under an indexed sub-name and the returned rule names are joined with ' | '.
+   */
+  _generateUnionRule(name, altSchemas) {
+    const prefix = `${name ?? ''}${name ? '-' : 'alternative-'}`;
+    const ruleNames = altSchemas.map((alt, idx) => this.visit(alt, `${prefix}${idx}`));
+    return ruleNames.join(' | ');
+  }
+
+  /**
+   * Translates an anchored regular expression into GBNF and registers it as
+   * rule `name`. The generated rule matches a JSON string (surrounded by
+   * double quotes) whose content matches the pattern.
+   *
+   * Supported syntax: literals, '.', groups '( )', character classes '[ ]',
+   * alternation '|', and the quantifiers '*', '+', '?', '{n}', '{n,}', '{n,m}'
+   * and '{,m}'. Groups starting with '(?' are rejected.
+   *
+   * @param pattern regular expression; must start with '^' and end with '$'
+   * @param name    name of the rule to create
+   * @returns the key under which the rule was stored (see _addRule)
+   * @throws Error on missing anchors, unbalanced brackets, unsupported syntax,
+   *         invalid quantifiers, or a quantifier with nothing to repeat
+   */
+  _visitPattern(pattern, name) {
+    if (!pattern.startsWith('^') || !pattern.endsWith('$')) {
+      throw new Error('Pattern must start with "^" and end with "$"');
+    }
+    pattern = pattern.slice(1, -1);
+    // Maps a repeated non-literal sub-expression to the helper rule created for it.
+    const subRuleIds = {};
+
+    let i = 0;
+    const length = pattern.length;
+
+    const getDot = () => {
+      let rule;
+      if (this._dotall) {
+        rule = '[\\U00000000-\\U0010FFFF]';
+      } else {
+        // Accept any character... except \n and \r line break chars (\x0A and \x0D)
+        rule = '[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]';
+      }
+      return this._addRule('dot', rule);
+    };
+
+    // Sequence items are [text, isLiteral] pairs; literals are stored WITHOUT
+    // their surrounding quotes and only quoted here, when emitted.
+    const toRule = ([s, isLiteral]) => isLiteral ? "\"" + s + "\"" : s;
+
+    const transform = () => {
+      const start = i;
+      // For each component of this sequence, store its string representation and whether it's a literal.
+      // We only need a flat structure here to apply repetition operators to the last item, and
+      // to merge literals at the end (we're parsing grouped ( sequences ) recursively and don't treat '|' specially
+      // (GBNF's syntax is luckily very close to regular expressions!)
+      const seq = [];
+
+      const joinSeq = () => {
+        const ret = [];
+        for (const [isLiteral, g] of groupBy(seq, x => x[1])) {
+          if (isLiteral) {
+            // Adjacent literals are merged into a single quoted string.
+            ret.push([[...g].map(x => x[0]).join(''), true]);
+          } else {
+            ret.push(...g);
+          }
+        }
+        if (ret.length === 1) {
+          return ret[0];
+        }
+        return [ret.map(x => toRule(x)).join(' '), false];
+      };
+
+      while (i < length) {
+        const c = pattern[i];
+        if (c === '.') {
+          seq.push([getDot(), false]);
+          i += 1;
+        } else if (c === '(') {
+          i += 1;
+          if (i < length) {
+            if (pattern[i] === '?') {
+              throw new Error(`Unsupported pattern syntax "${pattern[i]}" at index ${i} of /${pattern}/`);
+            }
+          }
+          seq.push([`(${toRule(transform())})`, false]);
+        } else if (c === ')') {
+          i += 1;
+          if (start <= 0 || pattern[start - 1] !== '(') {
+            throw new Error(`Unbalanced parentheses; start = ${start}, i = ${i}, pattern = ${pattern}`);
+          }
+          return joinSeq();
+        } else if (c === '[') {
+          let squareBrackets = c;
+          i += 1;
+          while (i < length && pattern[i] !== ']') {
+            if (pattern[i] === '\\') {
+              // Keep escape sequences intact so an escaped ']' does not close the class.
+              squareBrackets += pattern.slice(i, i + 2);
+              i += 2;
+            } else {
+              squareBrackets += pattern[i];
+              i += 1;
+            }
+          }
+          if (i >= length) {
+            throw new Error(`Unbalanced square brackets; start = ${start}, i = ${i}, pattern = ${pattern}`);
+          }
+          squareBrackets += ']';
+          i += 1;
+          seq.push([squareBrackets, false]);
+        } else if (c === '|') {
+          seq.push(['|', false]);
+          i += 1;
+        } else if (c === '*' || c === '+' || c === '?') {
+          if (seq.length === 0) {
+            throw new Error(`Quantifier "${c}" has nothing to repeat at index ${i} of /${pattern}/`);
+          }
+          seq[seq.length - 1] = [toRule(seq[seq.length - 1]) + c, false];
+          i += 1;
+        } else if (c === '{') {
+          let curlyBrackets = c;
+          i += 1;
+          while (i < length && pattern[i] !== '}') {
+            curlyBrackets += pattern[i];
+            i += 1;
+          }
+          if (i >= length) {
+            throw new Error(`Unbalanced curly brackets; start = ${start}, i = ${i}, pattern = ${pattern}`);
+          }
+          curlyBrackets += '}';
+          i += 1;
+          const nums = curlyBrackets.slice(1, -1).split(',').map(s => s.trim());
+          let minTimes, maxTimes;
+          if (nums.length === 1) {
+            minTimes = parseInt(nums[0], 10);
+            maxTimes = minTimes;
+          } else {
+            if (nums.length !== 2) {
+              throw new Error(`Invalid quantifier ${curlyBrackets}`);
+            }
+            minTimes = nums[0] ? parseInt(nums[0], 10) : 0;
+            maxTimes = nums[1] ? parseInt(nums[1], 10) : Infinity;
+          }
+          if (Number.isNaN(minTimes) || Number.isNaN(maxTimes) || maxTimes < minTimes) {
+            throw new Error(`Invalid quantifier ${curlyBrackets}`);
+          }
+          if (seq.length === 0) {
+            throw new Error(`Quantifier ${curlyBrackets} has nothing to repeat in /${pattern}/`);
+          }
+
+          const last = seq[seq.length - 1];
+
+          if (minTimes === 0 && maxTimes === Infinity) {
+            seq[seq.length - 1] = [`${toRule(last)}*`, false];
+          } else if (minTimes === 0 && maxTimes === 1) {
+            seq[seq.length - 1] = [`${toRule(last)}?`, false];
+          } else if (minTimes === 1 && maxTimes === Infinity) {
+            seq[seq.length - 1] = [`${toRule(last)}+`, false];
+          } else {
+            let sub;
+            if (last[1]) {
+              // Literals are stored unquoted: quote before repeating.
+              sub = toRule(last);
+            } else {
+              // Non-literals are factored into a named helper rule so the
+              // expression is not duplicated for every repetition.
+              sub = subRuleIds[last[0]];
+              if (sub === undefined) {
+                sub = this._addRule(`${name}-${Object.keys(subRuleIds).length + 1}`, last[0]);
+                subRuleIds[last[0]] = sub;
+              }
+            }
+
+            const repeatedSub = Array.from({ length: minTimes }, () => sub);
+            // An open upper bound is represented by Infinity (never undefined).
+            const optionalSub = maxTimes === Infinity
+              ? [`${sub}*`]
+              : Array.from({ length: maxTimes - minTimes }, () => `${sub}?`);
+            seq[seq.length - 1] = [repeatedSub.concat(optionalSub).join(' '), false];
+          }
+        } else {
+          let literal = '';
+          while (i < length) {
+            if (pattern[i] === '\\' && i < length - 1) {
+              const next = pattern[i + 1];
+              if (ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.has(next)) {
+                // Escaped only for the regexp's sake: emit the bare character.
+                i += 1;
+                literal += pattern[i];
+                i += 1;
+              } else {
+                literal += pattern.slice(i, i + 2);
+                i += 2;
+              }
+            } else if (pattern[i] === '"') {
+              literal += '\\"';
+              i += 1;
+            } else if (!NON_LITERAL_SET.has(pattern[i]) &&
+                (i === length - 1 || literal === '' || pattern[i + 1] === '.' || !NON_LITERAL_SET.has(pattern[i+1]))) {
+              // Stop one character early when an operator follows, so that the
+              // operator applies to that last character only.
+              literal += pattern[i];
+              i += 1;
+            } else {
+              break;
+            }
+          }
+          if (literal !== '') {
+            seq.push([literal, true]);
+          }
+        }
+      }
+
+      return joinSeq();
+    };
+
+    return this._addRule(name, "\"\\\"\" " + toRule(transform()) + " \"\\\"\" space");
+  }
+
+  /**
+   * Returns the rule name for `ref`, generating the rule from the schema stored
+   * in `this._refs[ref]` on first use. A ref that is currently being resolved
+   * is not visited again; its plain name (last '/' segment) is returned instead.
+   */
+  _resolveRef(ref) {
+    const segments = ref.split('/');
+    const baseName = segments[segments.length - 1];
+    if (baseName in this._rules || this._refsBeingResolved.has(ref)) {
+      return baseName;
+    }
+
+    this._refsBeingResolved.add(ref);
+    const ruleName = this.visit(this._refs[ref], baseName);
+    this._refsBeingResolved.delete(ref);
+    return ruleName;
+  }
+
+  // Serializes `value` to JSON text and returns it as a quoted grammar literal.
+  _generateConstantRule(value) {
+    const serialized = JSON.stringify(value);
+    return this._formatLiteral(serialized);
+  }
+
  visit(schema, name) {
    const schemaType = schema.type;
-    const ruleName = name || 'root';
+    const schemaFormat = schema.format;
+    const ruleName = name in RESERVED_NAMES ? name + '-' : name == '' ? 'root' : name;

-    if (schema.oneOf || schema.anyOf) {
-      const rule = (schema.oneOf || schema.anyOf).map((altSchema, i) =>
-        this.visit(altSchema, `${name}${name ? "-" : ""}${i}`)
-      ).join(' | ');
-
-      return this._addRule(ruleName, rule);
+    const ref = schema.$ref;
+    if (ref !== undefined) {
+      return this._addRule(ruleName, this._resolveRef(ref));
+    } else if (schema.oneOf || schema.anyOf) {
+      return this._addRule(ruleName, this._generateUnionRule(name, schema.oneOf || schema.anyOf));
+    } else if (Array.isArray(schemaType)) {
+      return this._addRule(ruleName, this._generateUnionRule(name, schemaType.map(t => ({ type: t }))));
    } else if ('const' in schema) {
-      return this._addRule(ruleName, this._formatLiteral(schema.const));
+      return this._addRule(ruleName, this._generateConstantRule(schema.const));
    } else if ('enum' in schema) {
-      const rule = schema.enum.map(v => this._formatLiteral(v)).join(' | ');
+      const rule = schema.enum.map(v => this._generateConstantRule(v)).join(' | ');
      return this._addRule(ruleName, rule);
-    } else if (schemaType === 'object' && 'properties' in schema) {
-      // TODO: `required` keyword (from python implementation)
-      const propOrder = this._propOrder;
-      const propPairs = Object.entries(schema.properties).sort((a, b) => {
-        // sort by position in prop_order (if specified) then by key
-        const orderA = typeof propOrder[a[0]] === 'number' ? propOrder[a[0]] : Infinity;
-        const orderB = typeof propOrder[b[0]] === 'number' ? propOrder[b[0]] : Infinity;
-        return orderA - orderB || a[0].localeCompare(b[0]);
-      });
-
-      let rule = '"{" space';
-      propPairs.forEach(([propName, propSchema], i) => {
-        const propRuleName = this.visit(propSchema, `${name}${name ? "-" : ""}${propName}`);
-        if (i > 0) {
-          rule += ' "," space';
+    } else if ((schemaType === undefined || schemaType === 'object') &&
+               ('properties' in schema ||
+                ('additionalProperties' in schema && schema.additionalProperties !== true))) {
+      const required = new Set(schema.required || []);
+      const properties = Object.entries(schema.properties ?? {});
+      return this._addRule(ruleName, this._buildObjectRule(properties, required, name, schema.additionalProperties));
+    } else if ((schemaType === undefined || schemaType === 'object') && 'allOf' in schema) {
+      const required = new Set();
+      const properties = [];
+      const addComponent = (compSchema, isRequired) => {
+        const ref = compSchema.$ref;
+        if (ref !== undefined) {
+          compSchema = this._refs[ref];
        }
-        rule += ` ${this._formatLiteral(propName)} space ":" space ${propRuleName}`;
-      });
-      rule += ' "}" space';

-      return this._addRule(ruleName, rule);
-    } else if (schemaType === 'array' && 'items' in schema) {
-      // TODO `prefixItems` keyword (from python implementation)
-      const itemRuleName = this.visit(schema.items, `${name}${name ? "-" : ""}item`);
-      const rule = `"[" space (${itemRuleName} ("," space ${itemRuleName})*)? "]" space`;
-      return this._addRule(ruleName, rule);
+        if ('properties' in compSchema) {
+          for (const [propName, propSchema] of Object.entries(compSchema.properties)) {
+            properties.push([propName, propSchema]);
+            if (isRequired) {
+              required.add(propName);
+            }
+          }
+        }
+      };
+
+      for (const t of schema.allOf) {
+        if ('anyOf' in t) {
+          for (const tt of t.anyOf) {
+            addComponent(tt, false);
+          }
+        } else {
+          addComponent(t, true);
+        }
+      }
+
+      return this._addRule(ruleName, this._buildObjectRule(properties, required, name, /* additionalProperties= */ false));
+    } else if ((schemaType === undefined || schemaType === 'array') && ('items' in schema || 'prefixItems' in schema)) {
+      const items = schema.items ?? schema.prefixItems;
+      if (Array.isArray(items)) {
+        return this._addRule(
+          ruleName,
+          '"[" space ' +
+            items.map((item, i) => this.visit(item, `${name ?? ''}${name ? '-' : ''}tuple-${i}`)).join(' "," space ') +
+            ' "]" space'
+        );
+      } else {
+        const itemRuleName = this.visit(items, `${name ?? ''}${name ? '-' : ''}item`);
+        const listItemOperator = `( "," space ${itemRuleName} )`;
+        let successiveItems = '';
+        let minItems = schema.minItems || 0;
+        const maxItems = schema.maxItems;
+        if (minItems > 0) {
+          successiveItems = listItemOperator.repeat(minItems - 1);
+          minItems--;
+        }
+        if (maxItems !== undefined && maxItems > minItems) {
+          successiveItems += `${listItemOperator}?`.repeat(maxItems - minItems - 1);
+        } else {
+          successiveItems += `${listItemOperator}*`;
+        }
+        const rule = minItems === 0
+          ? `"[" space ( ${itemRuleName} ${successiveItems} )? "]" space`
+          : `"[" space ${itemRuleName} ${successiveItems} "]" space`;
+        return this._addRule(ruleName, rule);
+      }
+    } else if ((schemaType === undefined || schemaType === 'string') && 'pattern' in schema) {
+      return this._visitPattern(schema.pattern, ruleName);
+    } else if ((schemaType === undefined || schemaType === 'string') && /^uuid[1-5]?$/.test(schema.format || '')) {
+      return this._addRule(
+          ruleName === 'root' ? 'root' : schemaFormat,
+          PRIMITIVE_RULES['uuid'])
+    } else if ((schemaType === undefined || schemaType === 'string') && schema.format in DATE_RULES) {
+      for (const [t, r] of Object.entries(DATE_RULES)) {
+        this._addRule(t, r);
+      }
+      return schemaFormat + '-string';
+    } else if ((schemaType === 'object') || (Object.keys(schema).length === 0)) {
+      for (const n of OBJECT_RULE_NAMES) {
+        this._addRule(n, PRIMITIVE_RULES[n]);
+      }
+      return this._addRule(ruleName, 'object');
    } else {
-      if (!PRIMITIVE_RULES[schemaType]) {
+      if (!(schemaType in PRIMITIVE_RULES)) {
        throw new Error(`Unrecognized schema: ${JSON.stringify(schema)}`);
      }
-      return this._addRule(
-        ruleName === 'root' ? 'root' : schemaType,
-        PRIMITIVE_RULES[schemaType]
+      // TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
+      return this._addRule(ruleName === 'root' ? 'root' : schemaType, PRIMITIVE_RULES[schemaType]);
+    }
+  }
+
+  /**
+   * Builds the body of the grammar rule for a JSON object.
+   *
+   * @param properties           array of [propName, propSchema] pairs
+   * @param required             Set of property names that must be present
+   * @param name                 rule-name prefix for the generated sub-rules
+   * @param additionalProperties `true` or a schema object to allow extra
+   *                             properties; any other value allows none
+   * @returns the rule body (not yet registered with _addRule)
+   */
+  _buildObjectRule(properties, required, name, additionalProperties) {
+    const propOrder = this._propOrder;
+    // A position of 0 is a valid order, so test the type rather than truthiness.
+    const orderOf = (k) => (typeof propOrder[k] === 'number' ? propOrder[k] : Infinity);
+    // First position of each property name in the original list.
+    const firstIndex = new Map();
+    properties.forEach(([k], idx) => {
+      if (!firstIndex.has(k)) {
+        firstIndex.set(k, idx);
+      }
+    });
+    // sort by position in prop_order (if specified) then by original order
+    const sortedProps = properties.map(([k]) => k).sort((a, b) => {
+      // Infinity - Infinity is NaN, which is falsy, so unordered props fall back to original order.
+      return orderOf(a) - orderOf(b) || firstIndex.get(a) - firstIndex.get(b);
+    });
+
+    const propKvRuleNames = {};
+    for (const [propName, propSchema] of properties) {
+      const propRuleName = this.visit(propSchema, `${name ?? ''}${name ? '-' : ''}${propName}`);
+      propKvRuleNames[propName] = this._addRule(
+        `${name ?? ''}${name ? '-' : ''}${propName}-kv`,
+        `${this._formatLiteral(JSON.stringify(propName))} space ":" space ${propRuleName}`
       );
     }
+    const requiredProps = sortedProps.filter(k => required.has(k));
+    const optionalProps = sortedProps.filter(k => !required.has(k));
+
+    // typeof null is 'object' too; null is not a schema, so exclude it.
+    if ((typeof additionalProperties === 'object' && additionalProperties !== null) || additionalProperties === true) {
+      const subName = `${name ?? ''}${name ? '-' : ''}additional`;
+      const valueRule = this.visit(additionalProperties === true ? {} : additionalProperties, `${subName}-value`);
+      // '*' stands for "any additional key/value pair" among the optional props.
+      propKvRuleNames['*'] = this._addRule(
+        `${subName}-kv`,
+        `${this._addRule('string', PRIMITIVE_RULES['string'])} ":" space ${valueRule}`);
+      optionalProps.push('*');
+    }
+
+    let rule = '"{" space ';
+    rule += requiredProps.map(k => propKvRuleNames[k]).join(' "," space ');
+
+    if (optionalProps.length > 0) {
+      rule += ' (';
+      if (requiredProps.length > 0) {
+        rule += ' "," space ( ';
+      }
+
+      // For the list ks, emits its first property followed by a "-rest" rule
+      // in which every later property is optional.
+      const getRecursiveRefs = (ks, firstIsOptional) => {
+        const [k, ...rest] = ks;
+        const kvRuleName = propKvRuleNames[k];
+        let res;
+        if (k === '*') {
+          res = this._addRule(
+            `${name ?? ''}${name ? '-' : ''}additional-kvs`,
+            `${kvRuleName} ( "," space ` + kvRuleName + ` )*`
+          );
+        } else if (firstIsOptional) {
+          res = `( "," space ${kvRuleName} )?`;
+        } else {
+          res = kvRuleName;
+        }
+        if (rest.length > 0) {
+          res += ' ' + this._addRule(
+            `${name ?? ''}${name ? '-' : ''}${k}-rest`,
+            getRecursiveRefs(rest, true)
+          );
+        }
+        return res;
+      };
+
+      // One alternative per possible first optional property.
+      rule += optionalProps.map((_, i) => getRecursiveRefs(optionalProps.slice(i), false)).join(' | ');
+      if (requiredProps.length > 0) {
+        rule += ' )';
+      }
+      rule += ' )?';
+    }
+
+    rule += ' "}" space';
+
+    return rule;
   }

  formatGrammar() {
    let grammar = '';
-    this._rules.forEach((rule, name) => {
+    for (const [name, rule] of Object.entries(this._rules).sort(([a], [b]) => a.localeCompare(b))) {
      grammar += `${name} ::= ${rule}\n`;
-    });
+    }
    return grammar;
  }
 }
+
+// Yields [key, items] pairs for each run of consecutive elements that share
+// the same key, as computed by keyFn. A key of null never starts a new run.
+function* groupBy(iterable, keyFn) {
+  let currentKey = null;
+  let bucket = [];
+  for (const item of iterable) {
+    const itemKey = keyFn(item);
+    const startsNewRun = !(currentKey === null || itemKey === currentKey);
+    if (startsNewRun) {
+      yield [currentKey, bucket];
+      bucket = [item];
+    } else {
+      bucket.push(item);
+    }
+    currentKey = itemKey;
+  }
+  if (bucket.length !== 0) {
+    yield [currentKey, bucket];
+  }
+}
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -1,6 +1,7 @@
 #include "utils.hpp"

 #include "common.h"
+#include "json-schema-to-grammar.h"
 #include "llama.h"
 #include "build-info.h"
 #include "grammar-parser.h"
@ -30,7 +31,7 @@
 #include <signal.h>
 #include <memory>

-using json = nlohmann::json;
+using json = nlohmann::ordered_json;

 bool server_verbose = false;
 bool server_log_json = true;
@ -179,6 +180,7 @@ struct server_slot {
    llama_token sampled;
    struct llama_sampling_params sparams;
    llama_sampling_context * ctx_sampling = nullptr;
+    json json_schema;

    int32_t ga_i = 0;   // group-attention state
    int32_t ga_n = 1;   // group-attention factor
@ -846,10 +848,25 @@ struct server_context {
        slot.sparams.penalize_nl       = json_value(data, "penalize_nl",       default_sparams.penalize_nl);
        slot.params.n_keep             = json_value(data, "n_keep",            slot.params.n_keep);
        slot.params.seed               = json_value(data, "seed",              default_params.seed);
-        slot.sparams.grammar           = json_value(data, "grammar",           default_sparams.grammar);
        slot.sparams.n_probs           = json_value(data, "n_probs",           default_sparams.n_probs);
        slot.sparams.min_keep          = json_value(data, "min_keep",          default_sparams.min_keep);

+        // process "json_schema" and "grammar"
+        if (data.contains("json_schema") && data.contains("grammar")) {
+            send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST);
+            return false;
+        } else if (data.contains("json_schema") && !data.contains("grammar")) {
+            try {
+                auto schema                = json_value(data, "json_schema", json::object());
+                slot.sparams.grammar       = json_schema_to_grammar(schema);
+            } catch (const std::exception & e) {
+                send_error(task, std::string("\"json_schema\": ") + e.what(), ERROR_TYPE_INVALID_REQUEST);
+                return false;
+            }
+        } else {
+            slot.sparams.grammar       = json_value(data, "grammar",           default_sparams.grammar);
+        }
+
        if (slot.params.cache_prompt && slot.ga_n != 1) {
            LOG_WARNING("cache_prompt is not supported with group-attention", {});
            slot.params.cache_prompt = false;
@ -1236,7 +1253,7 @@ struct server_context {
            {"penalize_nl",               slot.sparams.penalize_nl},
            {"stop",                      slot.params.antiprompt},
            {"n_predict",                 slot.params.n_predict}, // TODO: fix duplicate key n_predict
-            {"n_keep",                    params.n_keep},
+            {"n_keep",                    slot.params.n_keep},
            {"ignore_eos",                ignore_eos},
            {"stream",                    slot.params.stream},
            {"logit_bias",                slot.sparams.logit_bias},
@ -1747,7 +1764,7 @@ struct server_context {
        }

        // process in chunks of params.n_batch
-        int32_t n_batch = llama_n_batch(ctx);
+        int32_t n_batch  = llama_n_batch(ctx);
        int32_t n_ubatch = llama_n_ubatch(ctx);

        // next, batch any pending prompts without exceeding n_batch
@ -2197,7 +2214,11 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
    printf("  -m FNAME, --model FNAME\n");
    printf("                            model path (default: %s)\n", params.model.c_str());
    printf("  -mu MODEL_URL, --model-url MODEL_URL\n");
-    printf("                            model download url (default: %s)\n", params.model_url.c_str());
+    printf("                            model download url (default: unused)\n");
+    printf("  -hfr REPO, --hf-repo REPO\n");
+    printf("                            Hugging Face model repository (default: unused)\n");
+    printf("  -hff FILE, --hf-file FILE\n");
+    printf("                            Hugging Face model file (default: unused)\n");
    printf("  -a ALIAS, --alias ALIAS\n");
    printf("                            set an alias for the model, will be added as `model` field in completion response\n");
    printf("  --lora FNAME              apply LoRA adapter (implies --no-mmap)\n");
@ -2214,7 +2235,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
    printf("  -to N, --timeout N        server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
    printf("  --embeddings              enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
    printf("  -np N, --parallel N       number of slots for process requests (default: %d)\n", params.n_parallel);
-    printf("  -cb, --cont-batching      enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
+    printf("  -cb, --cont-batching      enable continuous batching (a.k.a dynamic batching) (default: enabled)\n");
    printf("  -spf FNAME, --system-prompt-file FNAME\n");
    printf("                            set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
    printf("  -ctk TYPE, --cache-type-k TYPE\n");
@ -2326,6 +2347,18 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                break;
            }
            params.model_url = argv[i];
+        } else if (arg == "-hfr" || arg == "--hf-repo") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.hf_repo = argv[i];
+        } else if (arg == "-hff" || arg == "--hf-file") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.hf_file = argv[i];
        } else if (arg == "-a" || arg == "--alias") {
            if (++i >= argc) {
                invalid_param = true;
--- a/examples/server/tests/features/parallel.feature
+++ b/examples/server/tests/features/parallel.feature
@ -4,7 +4,8 @@ Feature: Parallel

  Background: Server startup
    Given a server listening on localhost:8080
-    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
+    And   a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models
+    And   a model file test-model-00001-of-00003.gguf
    And   42 as server seed
    And   128 as batch size
    And   256 KV cache size
--- a/examples/server/tests/features/security.feature
+++ b/examples/server/tests/features/security.feature
@ -37,6 +37,22 @@ Feature: Security
      | llama.cpp | no        |
      | hackme    | raised    |

+  Scenario Outline: OAI Compatibility (invalid response formats)
+    Given a system prompt test
+    And   a user prompt test
+    And   a response format <response_format>
+    And   a model test
+    And   2 max tokens to predict
+    And   streaming is disabled
+    Given an OAI compatible chat completions request with raised api error
+
+    Examples: Prompts
+      | response_format                                       |
+      | {"type": "sound"}                                     |
+      | {"type": "json_object", "schema": 123}                |
+      | {"type": "json_object", "schema": {"type": 123}}      |
+      | {"type": "json_object", "schema": {"type": "hiccup"}} |
+

  Scenario Outline: CORS Options
    Given a user api key llama.cpp
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@ -4,8 +4,8 @@ Feature: llama.cpp server

  Background: Server startup
    Given a server listening on localhost:8080
-    And   a model url https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K.gguf
-    And   a model file stories260K.gguf
+    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
+    And   a model file test-model.gguf
    And   a model alias tinyllama-2
    And   42 as server seed
      # KV Cache corresponds to the total amount of tokens
@ -70,6 +70,22 @@ Feature: llama.cpp server
      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128        | (thanks\|happy\|bird\|Annabyear)+ | -1       | 64          | enabled          |           |


+  Scenario Outline: OAI Compatibility w/ response format
+    Given a model test
+    And   a system prompt test
+    And   a user prompt test
+    And   a response format <response_format>
+    And   10 max tokens to predict
+    Given an OAI compatible chat completions request with no api error
+    Then  <n_predicted> tokens are predicted matching <re_content>
+
+    Examples: Prompts
+      | response_format                                                     | n_predicted | re_content             |
+      | {"type": "json_object", "schema": {"const": "42"}}                  | 5           | "42"                   |
+      | {"type": "json_object", "schema": {"items": [{"type": "integer"}]}} | 10          | \[ -300 \]             |
+      | {"type": "json_object"}                                             | 10          | \{ " Jacky.            |
+
+
  Scenario: Tokenize / Detokenize
    When tokenizing:
    """
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@ -16,7 +16,6 @@ import numpy as np
 import openai
 from behave import step
 from behave.api.async_step import async_run_until_complete
-from huggingface_hub import hf_hub_download
 from prometheus_client import parser


@ -39,6 +38,8 @@ def step_server_config(context, server_fqdn, server_port):

    context.model_alias = None
    context.model_file = None
+    context.model_hf_repo = None
+    context.model_hf_file = None
    context.model_url = None
    context.n_batch = None
    context.n_ubatch = None
@ -59,6 +60,7 @@ def step_server_config(context, server_fqdn, server_port):
    context.seed = None
    context.server_seed = None
    context.user_api_key = None
+    context.response_format = None

    context.tasks_result = []
    context.concurrent_tasks = []
@ -67,9 +69,9 @@ def step_server_config(context, server_fqdn, server_port):

@step('a model file {hf_file} from HF repo {hf_repo}')
 def step_download_hf_model(context, hf_file, hf_repo):
-    context.model_file = hf_hub_download(repo_id=hf_repo, filename=hf_file)
-    if context.debug:
-        print(f"model file: {context.model_file}")
+    # Only record the repo and file name on the context; start_server_background()
+    # passes them to the server as --hf-repo / --hf-file.
+    context.model_hf_repo = hf_repo
+    context.model_hf_file = hf_file
+    # model_file is the base name of hf_file (directory components dropped).
+    context.model_file = os.path.basename(hf_file)


@step('a model file {model_file}')
@ -269,6 +271,11 @@ def step_max_tokens(context, max_tokens):
    context.n_predict = max_tokens


+@step('a response format {response_format}')
+def step_response_format(context, response_format):
+    # The captured step text is JSON; store the decoded value on the context.
+    # json.loads raises on malformed input.
+    context.response_format = json.loads(response_format)
+
+
@step('streaming is {enable_streaming}')
 def step_streaming(context, enable_streaming):
    context.enable_streaming = enable_streaming == 'enabled'
@ -384,6 +391,9 @@ async def step_oai_chat_completions(context, api_error):
                                            enable_streaming=context.enable_streaming
                                            if hasattr(context, 'enable_streaming') else None,

+                                            response_format=context.response_format
+                                            if hasattr(context, 'response_format') else None,
+
                                            seed=await completions_seed(context),

                                            user_api_key=context.user_api_key
@ -443,6 +453,8 @@ async def step_oai_chat_completions(context):
                              if hasattr(context, 'n_predict') else None,
                              enable_streaming=context.enable_streaming
                              if hasattr(context, 'enable_streaming') else None,
+                              response_format=context.response_format
+                              if hasattr(context, 'response_format') else None,
                              seed=await completions_seed(context),
                              user_api_key=context.user_api_key
                              if hasattr(context, 'user_api_key') else None)
@ -463,6 +475,8 @@ async def step_oai_chat_completions(context):
                              if hasattr(context, 'n_predict') else None,
                              enable_streaming=context.enable_streaming
                              if hasattr(context, 'enable_streaming') else None,
+                              response_format=context.response_format
+                              if hasattr(context, 'response_format') else None,
                              seed=context.seed
                              if hasattr(context, 'seed') else
                              context.server_seed
@ -745,6 +759,7 @@ async def oai_chat_completions(user_prompt,
                               model=None,
                               n_predict=None,
                               enable_streaming=None,
+                               response_format=None,
                               seed=None,
                               user_api_key=None,
                               expect_api_error=None):
@ -770,6 +785,8 @@ async def oai_chat_completions(user_prompt,
        "stream": enable_streaming,
        "seed": seed
    }
+    if response_format is not None:
+        payload['response_format'] = response_format
    completion_response = {
        'content': '',
        'timings': {
@ -830,6 +847,7 @@ async def oai_chat_completions(user_prompt,
                model=model,
                max_tokens=n_predict,
                stream=enable_streaming,
+                response_format=payload.get('response_format'),
                seed=seed
            )
        except openai.error.AuthenticationError as e:
@ -1062,6 +1080,10 @@ def start_server_background(context):
        server_args.extend(['--model', context.model_file])
    if context.model_url:
        server_args.extend(['--model-url', context.model_url])
+    if context.model_hf_repo:
+        server_args.extend(['--hf-repo', context.model_hf_repo])
+    if context.model_hf_file:
+        server_args.extend(['--hf-file', context.model_hf_file])
    if context.n_batch:
        server_args.extend(['--batch-size', context.n_batch])
    if context.n_ubatch:
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@ -12,7 +12,7 @@

 #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"

-using json = nlohmann::json;
+using json = nlohmann::ordered_json;

 // https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
 enum error_type {
@ -95,8 +95,8 @@ static inline void server_log(const char *level, const char *function, int line,

        const std::string str = ss.str();
        printf("%.*s\n", (int)str.size(), str.data());
-        fflush(stdout);
    }
+    fflush(stdout);
 }

 //
@ -352,40 +352,71 @@ static json oaicompat_completion_params_parse(
    // https://platform.openai.com/docs/api-reference/chat/create
    llama_sampling_params default_sparams;
    llama_params["model"]             = json_value(body,   "model",             std::string("unknown"));
-    llama_params["prompt"]            = format_chat(model, chat_template,       body["messages"]);
-    llama_params["cache_prompt"]      = json_value(body,   "cache_prompt",      false);
-    llama_params["temperature"]       = json_value(body,   "temperature",       0.0);
-    llama_params["top_k"]             = json_value(body,   "top_k",             default_sparams.top_k);
-    llama_params["top_p"]             = json_value(body,   "top_p",             1.0);
-    llama_params["n_predict"]         = json_value(body,   "max_tokens",        -1);
-    llama_params["logit_bias"]        = json_value(body,   "logit_bias",        json::object());
    llama_params["frequency_penalty"] = json_value(body,   "frequency_penalty", 0.0);
+    llama_params["logit_bias"]        = json_value(body,   "logit_bias",        json::object());
+    llama_params["n_predict"]         = json_value(body,   "max_tokens",        -1);
    llama_params["presence_penalty"]  = json_value(body,   "presence_penalty",  0.0);
    llama_params["seed"]              = json_value(body,   "seed",              LLAMA_DEFAULT_SEED);
    llama_params["stream"]            = json_value(body,   "stream",            false);
-    llama_params["mirostat"]          = json_value(body,   "mirostat",          default_sparams.mirostat);
-    llama_params["mirostat_tau"]      = json_value(body,   "mirostat_tau",      default_sparams.mirostat_tau);
-    llama_params["mirostat_eta"]      = json_value(body,   "mirostat_eta",      default_sparams.mirostat_eta);
-    llama_params["penalize_nl"]       = json_value(body,   "penalize_nl",       default_sparams.penalize_nl);
-    llama_params["typical_p"]         = json_value(body,   "typical_p",         default_sparams.typical_p);
-    llama_params["repeat_last_n"]     = json_value(body,   "repeat_last_n",     default_sparams.penalty_last_n);
-    llama_params["ignore_eos"]        = json_value(body,   "ignore_eos",        false);
-    llama_params["tfs_z"]             = json_value(body,   "tfs_z",             default_sparams.tfs_z);
-    llama_params["n_keep"]            = json_value(body,   "n_keep",            0);
+    llama_params["temperature"]       = json_value(body,   "temperature",       0.0);
+    llama_params["top_p"]             = json_value(body,   "top_p",             1.0);

-    if (body.count("grammar") != 0) {
-        llama_params["grammar"] = json_value(body, "grammar", json::object());
-    }
+    // Apply chat template to the list of messages
+    llama_params["prompt"] = format_chat(model, chat_template, body["messages"]);

-    // Handle 'stop' field
+    // Handle "stop" field
    if (body.contains("stop") && body["stop"].is_string()) {
        llama_params["stop"] = json::array({body["stop"].get<std::string>()});
    } else {
        llama_params["stop"] = json_value(body, "stop", json::array());
    }
+    // Some chat templates don't use EOS token to stop generation
+    // We must add their end sequences to list of stop words
+    llama_params["stop"].push_back("<|im_end|>"); // chatml
+    llama_params["stop"].push_back("<end_of_turn>"); // gemma

-    // Ensure there is ChatML-specific end sequence among stop words
-    llama_params["stop"].push_back("<|im_end|>");
+    // Handle "response_format" field
+    if (body.contains("response_format")) {
+        json response_format      = json_value(body, "response_format", json::object());
+        std::string response_type = json_value(response_format, "type", std::string());
+        if (response_type == "json_object") {
+            llama_params["json_schema"] = json_value(response_format, "schema", json::object());
+        } else if (!response_type.empty() && response_type != "text") {
+            throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
+        }
+    }
+
+    // Handle "n" field
+    int n_choices = json_value(body, "n", 1);
+    if (n_choices != 1) {
+        throw std::runtime_error("Only one completion choice is allowed");
+    }
+
+    // Handle "logprobs" field
+    // TODO: The response format of this option is not yet OAI-compatible, but it seems no one is really using it; we may need to fix it in the future
+    if (body.contains("logprobs")) {
+        llama_params["n_probs"] = json_value(body, "top_logprobs", 20);
+    } else if (body.contains("top_logprobs")) {
+        throw std::runtime_error("top_logprobs requires logprobs to be set to true");
+    }
+
+    // Params supported by OAI but unsupported by llama.cpp
+    static const std::vector<std::string> unsupported_params { "tools", "tool_choice" };
+    for (auto & param : unsupported_params) {
+        if (body.contains(param)) {
+            throw std::runtime_error("Unsupported param: " + param);
+        }
+    }
+
+    // Copy remaining properties to llama_params
+    // This allows user to use llama.cpp-specific params like "mirostat", "tfs_z",... via OAI endpoint.
+    // See "launch_slot_with_task()" for a complete list of params supported by llama.cpp
+    for (const auto & item : body.items()) {
+        // Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens"
+        if (!llama_params.contains(item.key()) || item.key() == "n_predict") {
+            llama_params[item.key()] = item.value();
+        }
+    }

    return llama_params;
 }