llm-translate/app/file_processor_html.py
illian64 17ade3687f
Some checks failed
Python application / build (push) Has been cancelled
Parallel processing
2025-10-25 11:32:09 +07:00

108 lines
5.5 KiB
Python

from typing import Iterator
from bs4 import BeautifulSoup, PageElement, Tag
from app import file_processor, parallel_process, dto
from app.app_core import AppCore
class FileProcessorHtml:
    """Translate the text of an HTML document held in a BeautifulSoup tree.

    Elements whose tag name is listed in ``options["text_tags"]`` or
    ``options["header_tags"]`` are translated. A processed source element is
    marked with the ``data-src`` attribute and a generated translation element
    with ``data-tr``; children of elements carrying either marker are skipped.
    """

    # Marker attribute set on an element whose text was taken for translation.
    attribute_source = "data-src"
    # Marker attribute set on an element created to hold translated text.
    attribute_translate = "data-tr"

    def __init__(self, core: AppCore, options: dict):
        """Store the core object and read the processor settings from *options*.

        Required keys: ``header_tags``, ``text_tags`` and ``text_format`` (with
        ``original_tag``, ``translate_tag`` and ``header_delimiter``).
        """
        self.core = core
        self.options = options
        self.header_tags = options["header_tags"]
        self.text_tags = options["text_tags"]
        text_format = options["text_format"]
        self.original_tag: str = text_format["original_tag"]
        self.translate_tag: str = text_format["translate_tag"]
        self.header_delimiter: str = text_format["header_delimiter"]
        self.context_params = core.file_processing_params.context_params

    def get_translate_element(self, soup: BeautifulSoup, child: PageElement, translate_txt: str) -> Tag:
        """Build the element that carries the translated text.

        The new element has the same tag name as ``child.parent`` and is marked
        with ``attribute_translate``. When ``translate_tag`` is not empty the
        text is wrapped in an additional tag of that name.
        """
        translate_element = soup.new_tag(child.parent.name)
        translate_element[self.attribute_translate] = "1"
        if self.translate_tag == "":
            translate_element.string = translate_txt
        else:
            wrapper = soup.new_tag(self.translate_tag)
            wrapper.string = translate_txt
            translate_element.append(wrapper)
        return translate_element

    def get_original_element(self, soup: BeautifulSoup, child: PageElement, original_text: str) -> None | Tag:
        """Build an element holding the original text wrapped in ``original_tag``.

        Returns ``None`` when ``original_tag`` is empty, i.e. when the original
        text does not need to be re-wrapped.
        """
        if self.original_tag == "":
            return None
        original_element = soup.new_tag(child.parent.name)
        wrapper = soup.new_tag(self.original_tag)
        wrapper.string = original_text
        original_element.append(wrapper)
        return original_element

    def _insert_translation(self, req: dto.ProcessingFileDirReq, soup: BeautifulSoup, child: PageElement,
                            child_tag: str, original_text: str, translate_txt: str) -> None:
        """Write one translated paragraph or header into the tree around *child*."""
        parent = child.parent
        if child_tag in self.text_tags:
            translate_element = self.get_translate_element(soup, child, translate_txt)
            if req.preserve_original_text:
                # Keep the source paragraph and put the translation right after it.
                parent.insert_after(translate_element)
                original_element = self.get_original_element(soup, child, original_text)
                if original_element is not None:
                    child.replace_with(original_element)
            else:
                child.replace_with(translate_element)
        elif child_tag in self.header_tags:
            if req.preserve_original_text:
                parent.string = f'{original_text}{self.header_delimiter}{translate_txt}'
            else:
                parent.string = translate_txt

    def process_html(self, req: dto.ProcessingFileDirReq, soup: BeautifulSoup, body_tag, parallel: bool,
                     gpu_count_for_parallel = None) -> None:
        """Walk the document and translate every text/header element once.

        :param req: request object; supplies ``translate_req(text=..., context=...)``
            and the ``preserve_original_text`` flag.
        :param soup: parsed document; modified in place when *parallel* is false.
        :param body_tag: if truthy, only descendants of the first such tag are
            processed, otherwise the whole document.
        :param parallel: when true this is a pre-pass: translate requests are only
            collected and handed to ``parallel_process.start_parallel_processing``;
            the tree is left untouched so that the following regular pass still
            sees every element as unprocessed.
        :param gpu_count_for_parallel: forwarded to ``start_parallel_processing``.

        ``options["translate_only_first_paragraphs"]`` (default 0 = no limit)
        caps the number of elements translated, in both modes.
        """
        translate_only_first_paragraphs: int = self.options.get("translate_only_first_paragraphs", 0)
        children: Iterator[PageElement] = soup.find(body_tag).descendants if body_tag else soup.descendants
        translated_paragraphs = 0
        all_original_text_items: list[str] = []
        translate_params: list[dto.TranslateCommonRequest] = []
        # Parents already queued by the pre-pass. A local set is used instead of the
        # data-src attribute: marking the tree here would make the regular pass skip
        # every element. Elements stay alive in the tree, so their ids are stable.
        queued_parent_ids: set[int] = set()

        for child in children:
            if not (child and child.text and child.parent):
                continue
            parent = child.parent
            if (parent.get(self.attribute_source) is not None
                    or parent.get(self.attribute_translate) is not None):
                continue
            child_tag = parent.name
            if not (child_tag and parent.text):
                continue
            if not (child_tag in self.text_tags or child_tag in self.header_tags):
                continue

            # get contents - for example <p><b>1</b>2<i>3</i><p> - 3 items. 1, 3 - tags, 2 - simple string
            # contents = parent.contents - for translate with save format within paragraph
            if parallel:
                if id(parent) in queued_parent_ids:
                    continue
                queued_parent_ids.add(id(parent))
            else:
                parent[self.attribute_source] = "1"

            original_text = parent.text
            # The context must be generated before the text is added to all_original_text_items.
            context = file_processor.get_context(items_to_context=all_original_text_items,
                                                 params=self.context_params, translate_text=original_text)
            all_original_text_items.append(original_text)
            translate_req = req.translate_req(text=original_text, context=context)

            if parallel:
                # Only fill the request list; translation is started after the loop.
                translate_params.append(translate_req)
                if 0 < translate_only_first_paragraphs <= len(translate_params):
                    break
                continue

            translate_txt = self.core.translate(translate_req).result
            translated_paragraphs += 1
            self._insert_translation(req, soup, child, child_tag, original_text, translate_txt)
            # Check the limit only after the translation has been written, so that
            # a limit of N yields N translated elements.
            if 0 < translate_only_first_paragraphs <= translated_paragraphs:
                break

        if parallel:
            parallel_process.start_parallel_processing(gpu_count_for_parallel, self.core, translate_params)

    def process(self, req: dto.ProcessingFileDirReq, soup: BeautifulSoup, body_tag: str | None = None) -> None:
        """Translate *soup* in place, with a parallel pre-pass when it is available.

        If ``parallel_process.translate_plugin_support_parallel_gpu_count``
        returns a value other than ``None`` for the request's translator plugin,
        a pre-pass first submits all translate requests for parallel processing
        (per the original author's note this fills the cache, which the regular
        pass then reads from). The regular pass always runs and edits the tree.
        """
        gpu_count_for_parallel = parallel_process.translate_plugin_support_parallel_gpu_count(
            self.core, req.translator_plugin)
        if gpu_count_for_parallel is not None:
            self.process_html(req, soup, body_tag, True, gpu_count_for_parallel)
        self.process_html(req, soup, body_tag, False)