mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-10 04:00:53 +00:00
jinja : support ensure_ascii=true, string repetition and int/float self-filtering (#21623)
* feat: jinja engine improvements for reka-edge
Port three Jinja engine improvements needed for the reka-edge model:
1. Python-style string repetition ("ab" * 3 → "ababab")
2. ensure_ascii=true support for tojson filter (escapes non-ASCII to \uXXXX)
3. int() builtin on value_int_t (identity, needed for Reka Edge template)
* fix: escape invalid utf8 bytes when ensure_ascii=true
The json_ensure_ascii_preserving_format function mishandles one edge
case: when UTF-8 decoding of a byte sequence fails, it copies the
offending non-ASCII byte to the output unchanged, as a raw byte.
This commit fixes that by emitting the standard Unicode replacement
character (U+FFFD, escaped as \ufffd) instead. This matches the standard
behavior of various programming languages such as Python, Rust, and Go.
* chore: address PR comments
1. Add todo comment for supporting string repetition for array/tuples
2. Add support for float identity operation
3. Move invalid UTF-8 test case to test_fuzzing
* chore: accept suggestion for common/jinja/value.cpp
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
---------
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
parent
5e9c635463
commit
243532e556
3 changed files with 160 additions and 3 deletions
|
|
@ -447,6 +447,18 @@ static void test_expressions(testing & t) {
|
|||
"hello world"
|
||||
);
|
||||
|
||||
test_template(t, "string repetition",
|
||||
"{{ 'ab' * 3 }}",
|
||||
json::object(),
|
||||
"ababab"
|
||||
);
|
||||
|
||||
test_template(t, "reversed string repetition",
|
||||
"{{ 3 * 'ab' }}",
|
||||
json::object(),
|
||||
"ababab"
|
||||
);
|
||||
|
||||
test_template(t, "ternary",
|
||||
"{{ 'yes' if cond else 'no' }}",
|
||||
{{"cond", true}},
|
||||
|
|
@ -693,6 +705,33 @@ static void test_filters(testing & t) {
|
|||
"\"\\u2713\""
|
||||
);
|
||||
|
||||
test_template(t, "tojson ensure_ascii=true nested object",
|
||||
"{{ data|tojson(ensure_ascii=true) }}",
|
||||
{{"data", {
|
||||
{"text", "\u2713"},
|
||||
{"items", json::array({"é", {{"snowman", "☃"}}})}
|
||||
}}},
|
||||
"{\"text\": \"\\u2713\", \"items\": [\"\\u00e9\", {\"snowman\": \"\\u2603\"}]}"
|
||||
);
|
||||
|
||||
test_template(t, "tojson ensure_ascii=true indent=2",
|
||||
"{{ data|tojson(ensure_ascii=true, indent=2) }}",
|
||||
{{"data", {
|
||||
{"text", "\u2713"},
|
||||
{"nested", {{"accent", "é"}}}
|
||||
}}},
|
||||
"{\n \"text\": \"\\u2713\",\n \"nested\": {\n \"accent\": \"\\u00e9\"\n }\n}"
|
||||
);
|
||||
|
||||
test_template(t, "tojson ensure_ascii=true preserves existing escapes",
|
||||
"{{ data|tojson(ensure_ascii=true) }}",
|
||||
{{"data", {
|
||||
{"emoji", "😀"},
|
||||
{"line", "a\nb"}
|
||||
}}},
|
||||
"{\"emoji\": \"\\ud83d\\ude00\", \"line\": \"a\\nb\"}"
|
||||
);
|
||||
|
||||
test_template(t, "tojson sort_keys=true",
|
||||
"{{ data|tojson(sort_keys=true) }}",
|
||||
{{"data", {{"b", 2}, {"a", 1}}}},
|
||||
|
|
@ -771,6 +810,12 @@ static void test_filters(testing & t) {
|
|||
"hello"
|
||||
);
|
||||
|
||||
test_template(t, "int filter on integer is identity",
|
||||
"{{ value|int }}",
|
||||
{{"value", 7}},
|
||||
"7"
|
||||
);
|
||||
|
||||
test_template(t, "none to string",
|
||||
"{{ x|string }}",
|
||||
{{"x", nullptr}},
|
||||
|
|
@ -2458,4 +2503,12 @@ static void test_fuzzing(testing & t) {
|
|||
t.assert_true("builtin " + type_name + "." + fn_name + " #" + std::to_string(i), fuzz_test_template(tmpl, vars));
|
||||
}
|
||||
});
|
||||
|
||||
t.test("tojson ensure_ascii=true with invalid utf-8", [&](testing & t) {
|
||||
t.assert_true("invalid utf-8 does not crash",
|
||||
fuzz_test_template(
|
||||
"{{ data|tojson(ensure_ascii=true) }}",
|
||||
{{"data", std::string("hello\xfe\xffworld")}}
|
||||
));
|
||||
});
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue