fix encoding errors on content creation

This commit is contained in:
LUIS NOVO 2024-10-25 08:16:00 -03:00
parent 375b7dc56b
commit 4f30f59382
2 changed files with 92 additions and 5 deletions

View file

@ -1,4 +1,5 @@
import re
import unicodedata
import fitz # type: ignore
import magic
@ -66,13 +67,100 @@ def file_type(state: SourceState):
return return_dict
def clean_pdf_text(text):
"""
Clean text extracted from PDFs with enhanced space handling.
Args:
text (str): The raw text extracted from a PDF
Returns:
str: Cleaned text with minimal necessary spacing
"""
if not text:
return text
# Step 1: Normalize Unicode characters
text = unicodedata.normalize("NFKC", text)
# Step 2: Replace common PDF artifacts
replacements = {
# Common ligatures
"": "fi",
"": "fl",
"": "ff",
"": "ffi",
"": "ffl",
# Quotation marks and apostrophes
""": "'", """: "'",
'"': '"',
"": "'",
"": ",",
"": '"',
# Dashes and hyphens
"": "-",
"": "-",
"": "-",
"": "-",
# Other common replacements
"": "...",
"": "*",
"°": " degrees ",
"¹": "1",
"²": "2",
"³": "3",
"©": "(c)",
"®": "(R)",
"": "(TM)",
}
for old, new in replacements.items():
text = text.replace(old, new)
# Step 3: Advanced space cleaning
# Remove control characters while preserving essential whitespace
text = "".join(
char for char in text if unicodedata.category(char)[0] != "C" or char in "\n\t "
)
# Step 4: Enhanced space cleaning
text = re.sub(r"[ \t]+", " ", text) # Consolidate horizontal whitespace
text = re.sub(r" +\n", "\n", text) # Remove spaces before newlines
text = re.sub(r"\n +", "\n", text) # Remove spaces after newlines
text = re.sub(r"\n\t+", "\n", text) # Remove tabs at start of lines
text = re.sub(r"\t+\n", "\n", text) # Remove tabs at end of lines
text = re.sub(r"\t+", " ", text) # Replace tabs with single space
# Step 5: Remove empty lines while preserving paragraph structure
text = re.sub(r"\n{3,}", "\n\n", text) # Max two consecutive newlines
text = re.sub(r"^\s+", "", text) # Remove leading whitespace
text = re.sub(r"\s+$", "", text) # Remove trailing whitespace
# Step 6: Clean up around punctuation
text = re.sub(r"\s+([.,;:!?)])", r"\1", text) # Remove spaces before punctuation
text = re.sub(r"(\()\s+", r"\1", text) # Remove spaces after opening parenthesis
text = re.sub(
r"\s+([.,])\s+", r"\1 ", text
) # Ensure single space after periods and commas
# Step 7: Remove zero-width and invisible characters
text = re.sub(r"[\u200b\u200c\u200d\ufeff\u200e\u200f]", "", text)
# Step 8: Fix hyphenation and line breaks
text = re.sub(
r"(?<=\w)-\s*\n\s*(?=\w)", "", text
) # Remove hyphenation at line breaks
return text.strip()
def _extract_text_from_pdf(pdf_path):
doc = fitz.open(pdf_path)
text = ""
for page in doc:
text += page.get_text()
doc.close()
return text
normalized_text = clean_pdf_text(text)
return normalized_text
def extract_pdf(state: SourceState):