"""Text helpers: chunking, token accounting, embeddings and text cleaning."""

import re
import unicodedata

from langchain_text_splitters import CharacterTextSplitter
from openai import OpenAI

client = OpenAI()

# Matches every character that remove_non_printable() drops in its second
# pass: anything that is not an ASCII letter, a digit, whitespace, or one of
# the punctuation marks . , ! ? -
_DISALLOWED_CHARS = re.compile(r"[^a-zA-Z0-9\s.,!?\-\n\t]")


def split_text(
    txt: str, chunk: int = 1000, overlap: int = 0, separator: str = " "
) -> list:
    """
    Split the input text into chunks.

    Args:
        txt (str): The input text to be split.
        chunk (int): The size of each chunk. Default is 1000.
        overlap (int): The number of characters to overlap between chunks.
            Default is 0.
        separator (str): The separator to use when splitting the text.
            Default is " ".

    Returns:
        list: A list of text chunks, as produced by
            ``CharacterTextSplitter.split_text``.
    """
    text_splitter = CharacterTextSplitter(
        chunk_size=chunk, chunk_overlap=overlap, separator=separator
    )
    return text_splitter.split_text(txt)


def token_count(input_string: str) -> int:
    """
    Count the number of tokens in the input string using the 'o200k_base'
    encoding.

    Args:
        input_string (str): The input string to count tokens for.

    Returns:
        int: The number of tokens in the input string.
    """
    # Imported lazily so the module can be loaded without tiktoken unless
    # token counting is actually requested.
    import tiktoken

    encoding = tiktoken.get_encoding("o200k_base")
    return len(encoding.encode(input_string))


def token_cost(token_count: int, cost_per_million: float = 0.150) -> float:
    """
    Calculate the cost of tokens based on the token count and cost per
    million tokens.

    Args:
        token_count (int): The number of tokens.
        cost_per_million (float): The cost per million tokens.
            Default is 0.150.

    Returns:
        float: The calculated cost for the given token count.
    """
    return cost_per_million * (token_count / 1_000_000)


def get_embedding(text: str, model: str = "text-embedding-3-small") -> list:
    """
    Get the embedding for the input text using the specified model.

    Newlines in ``text`` are replaced with spaces before the request is
    sent through the module-level ``client``.

    Args:
        text (str): The input text to get the embedding for.
        model (str): The name of the embedding model to use.
            Default is "text-embedding-3-small".

    Returns:
        list: The embedding vector for the input text.
    """
    text = text.replace("\n", " ")
    response = client.embeddings.create(input=[text], model=model)
    return response.data[0].embedding


def remove_non_ascii(text: str) -> str:
    """
    Remove every character outside the 7-bit ASCII range.

    Args:
        text (str): The input text.

    Returns:
        str: The text with all non-ASCII characters deleted.
    """
    return re.sub(r"[^\x00-\x7F]+", "", text)


def remove_non_printable(text: str) -> str:
    """
    Remove control characters and anything outside a small allow-list.

    Two passes are applied:

    1. Characters whose Unicode category starts with "C" are dropped,
       except newline and tab.
    2. Everything that is not an ASCII letter, a digit, whitespace, or one
       of ``. , ! ? -`` is dropped.

    Args:
        text (str): The input text.

    Returns:
        str: The filtered text.
    """
    # Remove control characters, except line breaks and tabs.
    text = "".join(
        char
        for char in text
        if char in "\n\t" or unicodedata.category(char)[0] != "C"
    )
    # Keep ASCII letters, digits, whitespace and basic punctuation.
    # NOTE(review): the original comment said accented letters are kept, but
    # the a-zA-Z range removes them; behaviour is left unchanged here --
    # confirm which one is intended before widening the allow-list.
    return _DISALLOWED_CHARS.sub("", text)


def surreal_clean(text: str) -> str:
    """
    Clean the input text with ``remove_non_printable`` and escape a colon
    that appears before the first space, for SurrealDB compatibility.

    Args:
        text (str): The input text to clean.

    Returns:
        str: The cleaned text with adjusted formatting.
    """
    text = remove_non_printable(text)

    # Escape the first colon with a backslash when it occurs before the
    # first space (or when the text contains no space at all).
    # NOTE(review): remove_non_printable() currently strips ':' characters,
    # so this branch cannot fire with the present allow-list; it is kept so
    # that the behaviour is correct if ':' is ever allowed through.
    first_space_index = text.find(" ")
    colon_index = text.find(":")
    colon_leads = colon_index != -1 and (
        first_space_index == -1 or colon_index < first_space_index
    )
    if colon_leads:
        # "\\:" is the two-character string backslash + colon; the previous
        # "\:" spelling relied on an invalid escape sequence.
        text = text.replace(":", "\\:", 1)
    return text