From 795fd3bb9d987615f02e06e202a052533aeaf231 Mon Sep 17 00:00:00 2001
From: LUIS NOVO
Date: Wed, 23 Oct 2024 14:13:22 -0300
Subject: [PATCH] improve surreal escaping

---
Review notes (ignored by `git am`):
- remove_non_printable: the whitelist was `a-zA-Z0-9`, which stripped accented
  letters although the comment says they are kept. It now uses
  str.isalnum()/str.isspace() so Unicode letters and digits survive; the
  punctuation set (. , ! ? -) is unchanged and "_" is still removed.
- NOTE(review): ":" is not in the whitelist, so the "add space after colon"
  step that follows in surreal_clean may never trigger -- confirm whether ":"
  should be kept.

 open_notebook/utils.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/open_notebook/utils.py b/open_notebook/utils.py
index 0a4c555..27ae092 100644
--- a/open_notebook/utils.py
+++ b/open_notebook/utils.py
@@ -1,5 +1,5 @@
 import re
-import string
+import unicodedata
 
 from langchain_text_splitters import CharacterTextSplitter
 from openai import OpenAI
@@ -78,7 +78,13 @@ def remove_non_ascii(text):
 
 
 def remove_non_printable(text):
-    return "".join(filter(lambda x: x in string.printable, text))
+    # Drop Unicode "Other" (C*) characters such as controls, except newlines and tabs
+    text = "".join(
+        char for char in text if unicodedata.category(char)[0] != "C" or char in "\n\t"
+    )
+    # Keep letters (including accented ones), digits, whitespace and basic punctuation
+    allowed = ".,!?-"
+    return "".join(c for c in text if c.isalnum() or c.isspace() or c in allowed)
 
 
 def surreal_clean(text):
@@ -91,7 +97,7 @@ def surreal_clean(text):
     Returns:
         str: The cleaned text with adjusted formatting.
     """
-    text = remove_non_printable(remove_non_ascii(text))
+    text = remove_non_printable(text)
 
     # Add space after colon if it's before the first space
     first_space_index = text.find(" ")