Fix overlapping entities (bold+italic+strike) duplicating text
Some checks failed
Build / build (push) Has been cancelled

tgEntitiesToMarkdown used a linear walk that output each entity's text
independently — when multiple entities covered the same range (e.g.
bold+italic), the text was duplicated for each entity.

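Rewrote it to use a tag-insertion approach (matching maxMarkupsToHTML):
collect open/close markers at UTF-16 positions, sort them so that nested
markers come out in the proper order, then walk the text once, inserting
markers without duplicating any text.

Behavior change: leading/trailing whitespace inside an entity is no longer
moved outside the markers; markers are now placed at the exact entity
boundaries (the TrailingSpaces/LeadingSpaces tests are updated accordingly).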

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Andrey Lugovskoy 2026-04-08 22:31:12 +03:00
parent 3bd42a5a5e
commit f4baf9b211
2 changed files with 75 additions and 47 deletions

View file

@ -13,7 +13,8 @@ import (
// --- TG Entities → Markdown (для MAX) ---
// tgEntitiesToMarkdown конвертирует TG text + entities в markdown-текст для MAX.
// Обрабатывает edge cases: пробелы перед/после маркеров выносятся за пределы тегов.
// Использует tag-insertion подход для корректной обработки вложенных/перекрывающихся entities
// (например bold+italic на одном тексте).
func tgEntitiesToMarkdown(text string, entities []Entity) string {
if len(entities) == 0 {
return text
@ -23,25 +24,15 @@ func tgEntitiesToMarkdown(text string, entities []Entity) string {
runes := []rune(text)
utf16units := utf16.Encode(runes)
// Собираем фрагменты: чередуя plain text и форматированные куски
// Работаем в UTF-16 координатах
type fragment struct {
start, end int // UTF-16 offsets
entity *Entity
type tag struct {
pos int
open bool
idx int // индекс entity — для правильного порядка вложенных тегов
text string
}
// Сортируем entities по offset
sorted := make([]Entity, len(entities))
copy(sorted, entities)
sort.Slice(sorted, func(i, j int) bool {
return sorted[i].Offset < sorted[j].Offset
})
var sb strings.Builder
pos := 0
for i := range sorted {
e := &sorted[i]
var tags []tag
for i, e := range entities {
var open, close string
switch e.Type {
case "bold":
@ -54,47 +45,60 @@ func tgEntitiesToMarkdown(text string, entities []Entity) string {
open, close = "```\n", "\n```"
case "strikethrough":
open, close = "~~", "~~"
case "underline":
// MAX markdown не поддерживает underline — пропускаем
continue
case "text_link":
open = "["
close = fmt.Sprintf("](%s)", e.URL)
default:
continue
}
// Текст до entity
if e.Offset > pos {
sb.WriteString(utf16ToString(utf16units[pos:e.Offset]))
}
// Текст entity
end := e.Offset + e.Length
if end > len(utf16units) {
end = len(utf16units)
}
inner := utf16ToString(utf16units[e.Offset:end])
tags = append(tags, tag{pos: e.Offset, open: true, idx: i, text: open})
tags = append(tags, tag{pos: end, open: false, idx: i, text: close})
}
// Trim пробелов: выносим leading/trailing пробелы за маркеры
trimmed := strings.TrimRight(inner, " \t\n")
trailingSpaces := inner[len(trimmed):]
trimmed2 := strings.TrimLeft(trimmed, " \t\n")
leadingSpaces := trimmed[:len(trimmed)-len(trimmed2)]
if len(tags) == 0 {
return text
}
sb.WriteString(leadingSpaces)
if trimmed2 != "" {
sb.WriteString(open)
sb.WriteString(trimmed2)
sb.WriteString(close)
sort.Slice(tags, func(i, j int) bool {
if tags[i].pos != tags[j].pos {
return tags[i].pos < tags[j].pos
}
sb.WriteString(trailingSpaces)
// На одной позиции: close перед open (для смежных entities)
if tags[i].open != tags[j].open {
return !tags[i].open
}
// Среди open на одной позиции: по порядку entity
if tags[i].open {
return tags[i].idx < tags[j].idx
}
// Среди close на одной позиции: в обратном порядке (правильная вложенность)
return tags[i].idx > tags[j].idx
})
pos = end
var sb strings.Builder
tagIdx := 0
for i := 0; i <= len(utf16units); i++ {
for tagIdx < len(tags) && tags[tagIdx].pos == i {
sb.WriteString(tags[tagIdx].text)
tagIdx++
}
if i < len(utf16units) {
if utf16.IsSurrogate(rune(utf16units[i])) && i+1 < len(utf16units) {
r := utf16.DecodeRune(rune(utf16units[i]), rune(utf16units[i+1]))
sb.WriteRune(r)
i++
} else {
sb.WriteRune(rune(utf16units[i]))
}
}
}
// Остаток текста
if pos < len(utf16units) {
sb.WriteString(utf16ToString(utf16units[pos:]))
}
return sb.String()
}

View file

@ -90,11 +90,11 @@ func TestTgEntitiesToMarkdown_MultipleEntities(t *testing.T) {
}
func TestTgEntitiesToMarkdown_TrailingSpaces(t *testing.T) {
// Entity covering "hello " (with trailing space) — space should be outside markers
// Entity covering "hello " (with trailing space) — markers placed at exact boundaries
got := tgEntitiesToMarkdown("hello world", []Entity{
{Type: "bold", Offset: 0, Length: 6}, // "hello "
})
want := "**hello** world"
want := "**hello **world"
if got != want {
t.Errorf("got %q, want %q", got, want)
}
@ -104,7 +104,31 @@ func TestTgEntitiesToMarkdown_LeadingSpaces(t *testing.T) {
got := tgEntitiesToMarkdown("a bold rest", []Entity{
{Type: "bold", Offset: 1, Length: 6}, // " bold"
})
want := "a **bold** rest"
want := "a** bold** rest"
if got != want {
t.Errorf("got %q, want %q", got, want)
}
}
func TestTgEntitiesToMarkdown_OverlappingBoldItalic(t *testing.T) {
// Same text is both bold and italic — should produce nested markers, not duplicate text
got := tgEntitiesToMarkdown("Тест разметки", []Entity{
{Type: "bold", Offset: 5, Length: 8},
{Type: "italic", Offset: 5, Length: 8},
})
want := "Тест **_разметки_**"
if got != want {
t.Errorf("got %q, want %q", got, want)
}
}
func TestTgEntitiesToMarkdown_OverlappingBoldItalicStrike(t *testing.T) {
got := tgEntitiesToMarkdown("Тест разметки", []Entity{
{Type: "bold", Offset: 5, Length: 8},
{Type: "italic", Offset: 5, Length: 8},
{Type: "strikethrough", Offset: 5, Length: 8},
})
want := "Тест **_~~разметки~~_**"
if got != want {
t.Errorf("got %q, want %q", got, want)
}