Fix overlapping entities (bold+italic+strike) duplicating text
Some checks failed
Build / build (push) Has been cancelled

tgEntitiesToMarkdown used a linear walk that output each entity's text
independently — when multiple entities covered the same range (e.g.
bold+italic), the text was duplicated for each entity.

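Rewrote it to use a tag-insertion approach (matching maxMarkupsToHTML):
collect open/close markers at UTF-16 positions, sort them in proper
nesting order, then walk the text inserting markers without duplication.
Markers now sit at the exact entity boundaries, so leading/trailing
whitespace is no longer moved outside them (tests updated accordingly).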

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Andrey Lugovskoy 2026-04-08 22:31:12 +03:00
parent 3bd42a5a5e
commit f4baf9b211
2 changed files with 75 additions and 47 deletions

View file

@ -13,7 +13,8 @@ import (
// --- TG Entities → Markdown (для MAX) --- // --- TG Entities → Markdown (для MAX) ---
// tgEntitiesToMarkdown конвертирует TG text + entities в markdown-текст для MAX. // tgEntitiesToMarkdown конвертирует TG text + entities в markdown-текст для MAX.
// Обрабатывает edge cases: пробелы перед/после маркеров выносятся за пределы тегов. // Использует tag-insertion подход для корректной обработки вложенных/перекрывающихся entities
// (например bold+italic на одном тексте).
func tgEntitiesToMarkdown(text string, entities []Entity) string { func tgEntitiesToMarkdown(text string, entities []Entity) string {
if len(entities) == 0 { if len(entities) == 0 {
return text return text
@ -23,25 +24,15 @@ func tgEntitiesToMarkdown(text string, entities []Entity) string {
runes := []rune(text) runes := []rune(text)
utf16units := utf16.Encode(runes) utf16units := utf16.Encode(runes)
// Собираем фрагменты: чередуя plain text и форматированные куски type tag struct {
// Работаем в UTF-16 координатах pos int
type fragment struct { open bool
start, end int // UTF-16 offsets idx int // индекс entity — для правильного порядка вложенных тегов
entity *Entity text string
} }
// Сортируем entities по offset var tags []tag
sorted := make([]Entity, len(entities)) for i, e := range entities {
copy(sorted, entities)
sort.Slice(sorted, func(i, j int) bool {
return sorted[i].Offset < sorted[j].Offset
})
var sb strings.Builder
pos := 0
for i := range sorted {
e := &sorted[i]
var open, close string var open, close string
switch e.Type { switch e.Type {
case "bold": case "bold":
@ -54,47 +45,60 @@ func tgEntitiesToMarkdown(text string, entities []Entity) string {
open, close = "```\n", "\n```" open, close = "```\n", "\n```"
case "strikethrough": case "strikethrough":
open, close = "~~", "~~" open, close = "~~", "~~"
case "underline":
// MAX markdown не поддерживает underline — пропускаем
continue
case "text_link": case "text_link":
open = "[" open = "["
close = fmt.Sprintf("](%s)", e.URL) close = fmt.Sprintf("](%s)", e.URL)
default: default:
continue continue
} }
// Текст до entity
if e.Offset > pos {
sb.WriteString(utf16ToString(utf16units[pos:e.Offset]))
}
// Текст entity
end := e.Offset + e.Length end := e.Offset + e.Length
if end > len(utf16units) { if end > len(utf16units) {
end = len(utf16units) end = len(utf16units)
} }
inner := utf16ToString(utf16units[e.Offset:end]) tags = append(tags, tag{pos: e.Offset, open: true, idx: i, text: open})
tags = append(tags, tag{pos: end, open: false, idx: i, text: close})
}
// Trim пробелов: выносим leading/trailing пробелы за маркеры if len(tags) == 0 {
trimmed := strings.TrimRight(inner, " \t\n") return text
trailingSpaces := inner[len(trimmed):] }
trimmed2 := strings.TrimLeft(trimmed, " \t\n")
leadingSpaces := trimmed[:len(trimmed)-len(trimmed2)]
sb.WriteString(leadingSpaces) sort.Slice(tags, func(i, j int) bool {
if trimmed2 != "" { if tags[i].pos != tags[j].pos {
sb.WriteString(open) return tags[i].pos < tags[j].pos
sb.WriteString(trimmed2)
sb.WriteString(close)
} }
sb.WriteString(trailingSpaces) // На одной позиции: close перед open (для смежных entities)
if tags[i].open != tags[j].open {
return !tags[i].open
}
// Среди open на одной позиции: по порядку entity
if tags[i].open {
return tags[i].idx < tags[j].idx
}
// Среди close на одной позиции: в обратном порядке (правильная вложенность)
return tags[i].idx > tags[j].idx
})
pos = end var sb strings.Builder
tagIdx := 0
for i := 0; i <= len(utf16units); i++ {
for tagIdx < len(tags) && tags[tagIdx].pos == i {
sb.WriteString(tags[tagIdx].text)
tagIdx++
}
if i < len(utf16units) {
if utf16.IsSurrogate(rune(utf16units[i])) && i+1 < len(utf16units) {
r := utf16.DecodeRune(rune(utf16units[i]), rune(utf16units[i+1]))
sb.WriteRune(r)
i++
} else {
sb.WriteRune(rune(utf16units[i]))
}
}
} }
// Остаток текста
if pos < len(utf16units) {
sb.WriteString(utf16ToString(utf16units[pos:]))
}
return sb.String() return sb.String()
} }

View file

@ -90,11 +90,11 @@ func TestTgEntitiesToMarkdown_MultipleEntities(t *testing.T) {
} }
func TestTgEntitiesToMarkdown_TrailingSpaces(t *testing.T) { func TestTgEntitiesToMarkdown_TrailingSpaces(t *testing.T) {
// Entity covering "hello " (with trailing space) — space should be outside markers // Entity covering "hello " (with trailing space) — markers placed at exact boundaries
got := tgEntitiesToMarkdown("hello world", []Entity{ got := tgEntitiesToMarkdown("hello world", []Entity{
{Type: "bold", Offset: 0, Length: 6}, // "hello " {Type: "bold", Offset: 0, Length: 6}, // "hello "
}) })
want := "**hello** world" want := "**hello **world"
if got != want { if got != want {
t.Errorf("got %q, want %q", got, want) t.Errorf("got %q, want %q", got, want)
} }
@ -104,7 +104,31 @@ func TestTgEntitiesToMarkdown_LeadingSpaces(t *testing.T) {
got := tgEntitiesToMarkdown("a bold rest", []Entity{ got := tgEntitiesToMarkdown("a bold rest", []Entity{
{Type: "bold", Offset: 1, Length: 6}, // " bold" {Type: "bold", Offset: 1, Length: 6}, // " bold"
}) })
want := "a **bold** rest" want := "a** bold** rest"
if got != want {
t.Errorf("got %q, want %q", got, want)
}
}
func TestTgEntitiesToMarkdown_OverlappingBoldItalic(t *testing.T) {
// Same text is both bold and italic — should produce nested markers, not duplicate text
got := tgEntitiesToMarkdown("Тест разметки", []Entity{
{Type: "bold", Offset: 5, Length: 8},
{Type: "italic", Offset: 5, Length: 8},
})
want := "Тест **_разметки_**"
if got != want {
t.Errorf("got %q, want %q", got, want)
}
}
func TestTgEntitiesToMarkdown_OverlappingBoldItalicStrike(t *testing.T) {
got := tgEntitiesToMarkdown("Тест разметки", []Entity{
{Type: "bold", Offset: 5, Length: 8},
{Type: "italic", Offset: 5, Length: 8},
{Type: "strikethrough", Offset: 5, Length: 8},
})
want := "Тест **_~~разметки~~_**"
if got != want { if got != want {
t.Errorf("got %q, want %q", got, want) t.Errorf("got %q, want %q", got, want)
} }