Files processing - books

* book translate

* files processing

* files processing

* files processing

* files processing

---------

Co-authored-by: APodoinikov <APodoynikov@detmir.ru>
This commit is contained in:
illian64 2025-09-04 11:09:29 +07:00 committed by GitHub
parent e9e0e647f7
commit 0a70da3b98
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
39 changed files with 1737 additions and 493 deletions

View file

@ -6,11 +6,11 @@ from ctranslate2 import Translator
from tqdm import tqdm
from transformers import PreTrainedTokenizerBase
from app import cuda, struct
from app import cuda, params
from app.app_core import AppCore
from app.struct import TranslateStruct, tp
from app.dto import TranslatePluginInitInfo, TranslateStruct
modname = os.path.basename(__file__)[:-3]
plugin_name = os.path.basename(__file__)[:-3]
model: Translator
tokenizer: PreTrainedTokenizerBase
@ -43,26 +43,26 @@ def start(core: AppCore):
def start_with_options(core: AppCore, manifest:dict):
struct.read_plugin_params(manifest)
params.read_plugin_translate_params(manifest)
return manifest
def init(core:AppCore):
options = core.plugin_options(modname)
def init(core:AppCore) -> TranslatePluginInitInfo:
options = core.plugin_options(plugin_name)
global model
global tokenizer
model = ctranslate2.Translator(options["model"],
model = ctranslate2.Translator(options["model"], compute_type=options["compute_type"],
device=cuda.get_device(options), device_index=options["cuda_device_index"])
tokenizer = transformers.AutoTokenizer.from_pretrained(options["tokenizer"])
return modname
return TranslatePluginInitInfo(plugin_name=plugin_name, model_name=f'{options["model"]}__{options["compute_type"]}')
def translate(core: AppCore, ts: TranslateStruct):
options = core.plugin_options(modname)
options = core.plugin_options(plugin_name)
# # implementation 1: one part - one batch
# for part in ts.parts:
@ -78,7 +78,7 @@ def translate(core: AppCore, ts: TranslateStruct):
# implementation 2: all parts - one batch. It's faster, but depends on the number of batches.
tokens_list = []
for part in tqdm(ts.parts, unit=tp.unit, ascii=tp.ascii, desc=tp.desc):
for part in tqdm(ts.parts, unit=params.tp.unit, ascii=params.tp.ascii, desc=params.tp.desc):
if part.need_to_translate():
input_text = "<2" + ts.req.to_lang + ">" + part.text
tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(input_text))