jinja : support ensure_ascii=true, string repetition and int/float self-filtering (#21623)

* feat: jinja engine improvements for reka-edge

Port three Jinja engine improvements needed for the reka-edge model:
1. Python-style string repetition ("ab" * 3 → "ababab")
2. ensure_ascii=true support for tojson filter (escapes non-ASCII to \uXXXX)
3. int() builtin on value_int_t (identity, needed for Reka Edge template)

* fix: escape invalid utf8 bytes when ensure_ascii=true

The json_ensure_ascii_preserving_format function mishandles one edge
case: when UTF-8 parsing fails, it writes the offending non-ASCII
character back to the output as a raw byte.

This commit fixes that by writing the Unicode standard replacement
character \ufffd to the output instead. This is the standard behavior
in various programming languages such as Python, Rust, and Go.

* chore: address PR comments

1. Add TODO comment for supporting repetition of arrays/tuples
2. Add support for float identity operation
3. Move invalid UTF-8 test case to test_fuzzing

* chore: accept suggestion for common/jinja/value.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
Kwa Jie Hao 2026-04-09 17:28:33 +08:00 committed by GitHub
parent 5e9c635463
commit 243532e556
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 160 additions and 3 deletions

View file

@ -447,6 +447,18 @@ static void test_expressions(testing & t) {
"hello world"
);
test_template(t, "string repetition",
"{{ 'ab' * 3 }}",
json::object(),
"ababab"
);
test_template(t, "reversed string repetition",
"{{ 3 * 'ab' }}",
json::object(),
"ababab"
);
test_template(t, "ternary",
"{{ 'yes' if cond else 'no' }}",
{{"cond", true}},
@ -693,6 +705,33 @@ static void test_filters(testing & t) {
"\"\\u2713\""
);
test_template(t, "tojson ensure_ascii=true nested object",
"{{ data|tojson(ensure_ascii=true) }}",
{{"data", {
{"text", "\u2713"},
{"items", json::array({"é", {{"snowman", "☃"}}})}
}}},
"{\"text\": \"\\u2713\", \"items\": [\"\\u00e9\", {\"snowman\": \"\\u2603\"}]}"
);
test_template(t, "tojson ensure_ascii=true indent=2",
"{{ data|tojson(ensure_ascii=true, indent=2) }}",
{{"data", {
{"text", "\u2713"},
{"nested", {{"accent", "é"}}}
}}},
"{\n \"text\": \"\\u2713\",\n \"nested\": {\n \"accent\": \"\\u00e9\"\n }\n}"
);
test_template(t, "tojson ensure_ascii=true preserves existing escapes",
"{{ data|tojson(ensure_ascii=true) }}",
{{"data", {
{"emoji", "😀"},
{"line", "a\nb"}
}}},
"{\"emoji\": \"\\ud83d\\ude00\", \"line\": \"a\\nb\"}"
);
test_template(t, "tojson sort_keys=true",
"{{ data|tojson(sort_keys=true) }}",
{{"data", {{"b", 2}, {"a", 1}}}},
@ -771,6 +810,12 @@ static void test_filters(testing & t) {
"hello"
);
test_template(t, "int filter on integer is identity",
"{{ value|int }}",
{{"value", 7}},
"7"
);
test_template(t, "none to string",
"{{ x|string }}",
{{"x", nullptr}},
@ -2458,4 +2503,12 @@ static void test_fuzzing(testing & t) {
t.assert_true("builtin " + type_name + "." + fn_name + " #" + std::to_string(i), fuzz_test_template(tmpl, vars));
}
});
t.test("tojson ensure_ascii=true with invalid utf-8", [&](testing & t) {
t.assert_true("invalid utf-8 does not crash",
fuzz_test_template(
"{{ data|tojson(ensure_ascii=true) }}",
{{"data", std::string("hello\xfe\xffworld")}}
));
});
}