mirror of
https://github.com/BEARlogin/max-telegram-bridge-bot.git
synced 2026-04-28 03:39:46 +00:00
Fix overlapping entities (bold+italic+strike) duplicating text
Some checks failed
Build / build (push) Has been cancelled
Some checks failed
Build / build (push) Has been cancelled
tgEntitiesToMarkdown used a linear walk that emitted each entity's text independently, so when multiple entities covered the same range (e.g. bold+italic) the text was duplicated once per entity. Rewrote it to use a tag-insertion approach (matching maxMarkupsToHTML): collect open/close markers at UTF-16 positions, sort them into proper nesting order, then walk the text and insert the markers without duplicating any text. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
3bd42a5a5e
commit
f4baf9b211
2 changed files with 75 additions and 47 deletions
92
markup.go
92
markup.go
|
|
@ -13,7 +13,8 @@ import (
|
||||||
// --- TG Entities → Markdown (для MAX) ---
|
// --- TG Entities → Markdown (для MAX) ---
|
||||||
|
|
||||||
// tgEntitiesToMarkdown конвертирует TG text + entities в markdown-текст для MAX.
|
// tgEntitiesToMarkdown конвертирует TG text + entities в markdown-текст для MAX.
|
||||||
// Обрабатывает edge cases: пробелы перед/после маркеров выносятся за пределы тегов.
|
// Использует tag-insertion подход для корректной обработки вложенных/перекрывающихся entities
|
||||||
|
// (например bold+italic на одном тексте).
|
||||||
func tgEntitiesToMarkdown(text string, entities []Entity) string {
|
func tgEntitiesToMarkdown(text string, entities []Entity) string {
|
||||||
if len(entities) == 0 {
|
if len(entities) == 0 {
|
||||||
return text
|
return text
|
||||||
|
|
@ -23,25 +24,15 @@ func tgEntitiesToMarkdown(text string, entities []Entity) string {
|
||||||
runes := []rune(text)
|
runes := []rune(text)
|
||||||
utf16units := utf16.Encode(runes)
|
utf16units := utf16.Encode(runes)
|
||||||
|
|
||||||
// Собираем фрагменты: чередуя plain text и форматированные куски
|
type tag struct {
|
||||||
// Работаем в UTF-16 координатах
|
pos int
|
||||||
type fragment struct {
|
open bool
|
||||||
start, end int // UTF-16 offsets
|
idx int // индекс entity — для правильного порядка вложенных тегов
|
||||||
entity *Entity
|
text string
|
||||||
}
|
}
|
||||||
|
|
||||||
// Сортируем entities по offset
|
var tags []tag
|
||||||
sorted := make([]Entity, len(entities))
|
for i, e := range entities {
|
||||||
copy(sorted, entities)
|
|
||||||
sort.Slice(sorted, func(i, j int) bool {
|
|
||||||
return sorted[i].Offset < sorted[j].Offset
|
|
||||||
})
|
|
||||||
|
|
||||||
var sb strings.Builder
|
|
||||||
pos := 0
|
|
||||||
|
|
||||||
for i := range sorted {
|
|
||||||
e := &sorted[i]
|
|
||||||
var open, close string
|
var open, close string
|
||||||
switch e.Type {
|
switch e.Type {
|
||||||
case "bold":
|
case "bold":
|
||||||
|
|
@ -54,47 +45,60 @@ func tgEntitiesToMarkdown(text string, entities []Entity) string {
|
||||||
open, close = "```\n", "\n```"
|
open, close = "```\n", "\n```"
|
||||||
case "strikethrough":
|
case "strikethrough":
|
||||||
open, close = "~~", "~~"
|
open, close = "~~", "~~"
|
||||||
|
case "underline":
|
||||||
|
// MAX markdown не поддерживает underline — пропускаем
|
||||||
|
continue
|
||||||
case "text_link":
|
case "text_link":
|
||||||
open = "["
|
open = "["
|
||||||
close = fmt.Sprintf("](%s)", e.URL)
|
close = fmt.Sprintf("](%s)", e.URL)
|
||||||
default:
|
default:
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// Текст до entity
|
|
||||||
if e.Offset > pos {
|
|
||||||
sb.WriteString(utf16ToString(utf16units[pos:e.Offset]))
|
|
||||||
}
|
|
||||||
|
|
||||||
// Текст entity
|
|
||||||
end := e.Offset + e.Length
|
end := e.Offset + e.Length
|
||||||
if end > len(utf16units) {
|
if end > len(utf16units) {
|
||||||
end = len(utf16units)
|
end = len(utf16units)
|
||||||
}
|
}
|
||||||
inner := utf16ToString(utf16units[e.Offset:end])
|
tags = append(tags, tag{pos: e.Offset, open: true, idx: i, text: open})
|
||||||
|
tags = append(tags, tag{pos: end, open: false, idx: i, text: close})
|
||||||
|
}
|
||||||
|
|
||||||
// Trim пробелов: выносим leading/trailing пробелы за маркеры
|
if len(tags) == 0 {
|
||||||
trimmed := strings.TrimRight(inner, " \t\n")
|
return text
|
||||||
trailingSpaces := inner[len(trimmed):]
|
}
|
||||||
trimmed2 := strings.TrimLeft(trimmed, " \t\n")
|
|
||||||
leadingSpaces := trimmed[:len(trimmed)-len(trimmed2)]
|
|
||||||
|
|
||||||
sb.WriteString(leadingSpaces)
|
sort.Slice(tags, func(i, j int) bool {
|
||||||
if trimmed2 != "" {
|
if tags[i].pos != tags[j].pos {
|
||||||
sb.WriteString(open)
|
return tags[i].pos < tags[j].pos
|
||||||
sb.WriteString(trimmed2)
|
|
||||||
sb.WriteString(close)
|
|
||||||
}
|
}
|
||||||
sb.WriteString(trailingSpaces)
|
// На одной позиции: close перед open (для смежных entities)
|
||||||
|
if tags[i].open != tags[j].open {
|
||||||
|
return !tags[i].open
|
||||||
|
}
|
||||||
|
// Среди open на одной позиции: по порядку entity
|
||||||
|
if tags[i].open {
|
||||||
|
return tags[i].idx < tags[j].idx
|
||||||
|
}
|
||||||
|
// Среди close на одной позиции: в обратном порядке (правильная вложенность)
|
||||||
|
return tags[i].idx > tags[j].idx
|
||||||
|
})
|
||||||
|
|
||||||
pos = end
|
var sb strings.Builder
|
||||||
|
tagIdx := 0
|
||||||
|
for i := 0; i <= len(utf16units); i++ {
|
||||||
|
for tagIdx < len(tags) && tags[tagIdx].pos == i {
|
||||||
|
sb.WriteString(tags[tagIdx].text)
|
||||||
|
tagIdx++
|
||||||
|
}
|
||||||
|
if i < len(utf16units) {
|
||||||
|
if utf16.IsSurrogate(rune(utf16units[i])) && i+1 < len(utf16units) {
|
||||||
|
r := utf16.DecodeRune(rune(utf16units[i]), rune(utf16units[i+1]))
|
||||||
|
sb.WriteRune(r)
|
||||||
|
i++
|
||||||
|
} else {
|
||||||
|
sb.WriteRune(rune(utf16units[i]))
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Остаток текста
|
|
||||||
if pos < len(utf16units) {
|
|
||||||
sb.WriteString(utf16ToString(utf16units[pos:]))
|
|
||||||
}
|
|
||||||
|
|
||||||
return sb.String()
|
return sb.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -90,11 +90,11 @@ func TestTgEntitiesToMarkdown_MultipleEntities(t *testing.T) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestTgEntitiesToMarkdown_TrailingSpaces(t *testing.T) {
|
func TestTgEntitiesToMarkdown_TrailingSpaces(t *testing.T) {
|
||||||
// Entity covering "hello " (with trailing space) — space should be outside markers
|
// Entity covering "hello " (with trailing space) — markers placed at exact boundaries
|
||||||
got := tgEntitiesToMarkdown("hello world", []Entity{
|
got := tgEntitiesToMarkdown("hello world", []Entity{
|
||||||
{Type: "bold", Offset: 0, Length: 6}, // "hello "
|
{Type: "bold", Offset: 0, Length: 6}, // "hello "
|
||||||
})
|
})
|
||||||
want := "**hello** world"
|
want := "**hello **world"
|
||||||
if got != want {
|
if got != want {
|
||||||
t.Errorf("got %q, want %q", got, want)
|
t.Errorf("got %q, want %q", got, want)
|
||||||
}
|
}
|
||||||
|
|
@ -104,7 +104,31 @@ func TestTgEntitiesToMarkdown_LeadingSpaces(t *testing.T) {
|
||||||
got := tgEntitiesToMarkdown("a bold rest", []Entity{
|
got := tgEntitiesToMarkdown("a bold rest", []Entity{
|
||||||
{Type: "bold", Offset: 1, Length: 6}, // " bold"
|
{Type: "bold", Offset: 1, Length: 6}, // " bold"
|
||||||
})
|
})
|
||||||
want := "a **bold** rest"
|
want := "a** bold** rest"
|
||||||
|
if got != want {
|
||||||
|
t.Errorf("got %q, want %q", got, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTgEntitiesToMarkdown_OverlappingBoldItalic(t *testing.T) {
|
||||||
|
// Same text is both bold and italic — should produce nested markers, not duplicate text
|
||||||
|
got := tgEntitiesToMarkdown("Тест разметки", []Entity{
|
||||||
|
{Type: "bold", Offset: 5, Length: 8},
|
||||||
|
{Type: "italic", Offset: 5, Length: 8},
|
||||||
|
})
|
||||||
|
want := "Тест **_разметки_**"
|
||||||
|
if got != want {
|
||||||
|
t.Errorf("got %q, want %q", got, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTgEntitiesToMarkdown_OverlappingBoldItalicStrike(t *testing.T) {
|
||||||
|
got := tgEntitiesToMarkdown("Тест разметки", []Entity{
|
||||||
|
{Type: "bold", Offset: 5, Length: 8},
|
||||||
|
{Type: "italic", Offset: 5, Length: 8},
|
||||||
|
{Type: "strikethrough", Offset: 5, Length: 8},
|
||||||
|
})
|
||||||
|
want := "Тест **_~~разметки~~_**"
|
||||||
if got != want {
|
if got != want {
|
||||||
t.Errorf("got %q, want %q", got, want)
|
t.Errorf("got %q, want %q", got, want)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue