mirror of
https://github.com/lfnovo/open-notebook.git
synced 2026-05-05 15:24:35 +00:00
fix encoding errors on content creation
This commit is contained in:
parent
375b7dc56b
commit
4f30f59382
2 changed files with 92 additions and 5 deletions
|
|
@ -1,4 +1,5 @@
|
|||
import re
|
||||
import unicodedata
|
||||
|
||||
import fitz # type: ignore
|
||||
import magic
|
||||
|
|
@ -66,13 +67,100 @@ def file_type(state: SourceState):
|
|||
return return_dict
|
||||
|
||||
|
||||
# Single-character substitutions applied after NFKC normalization.
# Keys are written as \u escapes so the table survives editors, diffs and
# copy/paste that silently flatten typographic characters to ASCII.
# Ligatures, the ellipsis, superscript digits and the trademark sign are not
# listed: NFKC has already expanded them by the time this table is applied.
_PDF_CHAR_TRANSLATION = str.maketrans(
    {
        # Quotation marks and apostrophes
        "\u201c": '"',  # left double quotation mark
        "\u201d": '"',  # right double quotation mark
        "\u2018": "'",  # left single quotation mark
        "\u2019": "'",  # right single quotation mark / apostrophe
        "\u2032": "'",  # prime
        "\u201a": ",",  # single low-9 quotation mark
        "\u201e": '"',  # double low-9 quotation mark
        # Dashes and hyphens
        "\u2012": "-",  # figure dash
        "\u2013": "-",  # en dash
        "\u2014": "-",  # em dash
        "\u2015": "-",  # horizontal bar
        # Other common replacements
        "\u2022": "*",  # bullet
        "\u00b0": " degrees ",  # degree sign
        "\u00a9": "(c)",  # copyright sign
        "\u00ae": "(R)",  # registered sign
    }
)


def clean_pdf_text(text):
    """
    Clean text extracted from PDFs with enhanced space handling.

    The text is NFKC-normalized, typographic quotes/dashes/symbols are
    mapped to ASCII equivalents, control and invisible format characters
    are dropped, whitespace is collapsed, and words hyphenated across a
    line break are re-joined.

    Args:
        text (str): The raw text extracted from a PDF

    Returns:
        str: Cleaned text with minimal necessary spacing. Falsy input
        (None or an empty string) is returned unchanged.
    """
    if not text:
        return text

    # Step 1: Normalize Unicode characters (also expands ligatures,
    # the ellipsis, superscript digits and no-break spaces).
    text = unicodedata.normalize("NFKC", text)

    # Step 2: Replace common PDF artifacts in a single pass.
    text = text.translate(_PDF_CHAR_TRANSLATION)

    # Step 3: Remove every character in a Unicode "C" category (control,
    # format, unassigned, ...) while keeping newline, tab and space.
    # Zero-width characters and the BOM are category Cf, so they go here.
    text = "".join(
        char
        for char in text
        if char in "\n\t " or unicodedata.category(char)[0] != "C"
    )

    # Step 4: Collapse horizontal whitespace. After this no tab remains,
    # and each run of spaces/tabs is exactly one space.
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r" *\n *", "\n", text)  # No spaces around line breaks

    # Step 5: Keep paragraph breaks but cap them at one blank line.
    text = re.sub(r"\n{3,}", "\n\n", text)
    text = text.strip()

    # Step 6: Clean up around punctuation. Note that \s also matches
    # newlines, so punctuation starting a line is pulled up to the
    # previous line.
    text = re.sub(r"\s+([.,;:!?)])", r"\1", text)  # No space before punctuation
    text = re.sub(r"(\()\s+", r"\1", text)  # No space after "("

    # Step 7: Re-join words that were hyphenated across a line break.
    text = re.sub(r"(?<=\w)-\s*\n\s*(?=\w)", "", text)

    return text.strip()
|
||||
|
||||
|
||||
def _extract_text_from_pdf(pdf_path):
    """
    Extract the text of every page of a PDF and return it cleaned.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        str: The concatenated page text after ``clean_pdf_text`` has
        normalized it.
    """
    doc = fitz.open(pdf_path)
    try:
        # Join once instead of growing a string page by page.
        raw_text = "".join(page.get_text() for page in doc)
    finally:
        # Release the document even if a page fails to extract.
        doc.close()

    # Normalize before returning; an early return of the raw text here
    # would leave the cleaning step unreachable.
    return clean_pdf_text(raw_text)
|
||||
|
||||
|
||||
def extract_pdf(state: SourceState):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue