llm-translate/test/test_text_splitter.py

import unittest

from app import text_splitter
from app.dto import Part
from app.params import TextSplitParams

# Short one-sentence fixtures; tests combine them with get_text() to build
# inputs and expected parts.
(s1, s2, s3, s4, s5, s6, s7, s8, s9) = (
    "Text one.",
    "Text two.",
    "Text three.",
    "Text four.",
    "Text five.",
    "Text six.",
    "Text seven.",
    "Text eight.",
    "Text nine.",
)

# Sentence-splitting input containing abbreviations and a domain name
# ("Mr.", "Jr.", "U.S.A", "Ph.D.", "Inc.", "craigslist.org").
split_txt1 = (
    "Some text. "
    "Mr. John Johnson Jr. was born in the U.S.A but earned his Ph.D. in Israel "
    "before joining Nike Inc. as an engineer. "
    "He also worked at craigslist.org as a business analyst. "
    "Some text one. Some text two."
)
# Sentence-splitting input containing an ellipsis, a doubled period and a
# trailing space.
split_txt2 = (
    "Some sentence. Mr. Holmes... "
    "This is a new sentence! And This is another one.. Hi "
)


def get_params():
    """Return baseline ``TextSplitParams`` with every split mode disabled.

    The expected length is 20 and the sentence splitter is ``"def"``; each
    test switches on the single mode it exercises on the returned object.
    """
    baseline = dict(
        split_expected_length=20,
        sentence_splitter="def",
        split_by_paragraphs_only=False,
        split_by_paragraphs_and_length=False,
        split_by_sentences_only=False,
        split_by_sentences_and_length=False,
    )
    return TextSplitParams(**baseline)


def get_text(*sentences):
    """Join *sentences* with single spaces, dropping the space after a newline.

    A sentence that ends in ``"\\n"`` therefore starts a new line directly,
    without a leading blank on the following sentence.
    """
    joined = " ".join(sentences)
    # The join inserts a space after every piece, including ones ending in a
    # newline; strip that space so lines do not start with a blank.
    return joined.replace("\n ", "\n")


class ReqProcessorTest(unittest.TestCase):
    """Tests for ``text_splitter.split_by_sentences`` and ``text_splitter.split_text``.

    Each test starts from ``get_params()`` (all split modes off) and enables
    only the mode under test. Expected results of ``split_text`` are lists of
    ``Part(text, flag)``.

    NOTE(review): judging by the expectations below, the boolean in ``Part``
    appears to mark a part that was followed by a newline in the input --
    confirm against ``app.dto.Part``.
    """

    def test_split_by_sentences_blingfire(self):
        params = get_params()
        params.sentence_splitter = "blingfire"

        # With this splitter "Some text." stays attached to the next sentence.
        split = text_splitter.split_by_sentences(split_txt1, params)
        exp = ['Some text. Mr. John Johnson Jr. was born in the U.S.A but earned his Ph.D. in Israel before joining Nike Inc. as an engineer.',
               'He also worked at craigslist.org as a business analyst.',
               'Some text one.',
               'Some text two.']
        self.assertEqual(exp, split)

        # "Mr." is split off on its own and the trailing space of "Hi " is kept.
        split = text_splitter.split_by_sentences(split_txt2, params)
        exp = ['Some sentence.',
               'Mr.',
               'Holmes...',
               'This is a new sentence!',
               'And This is another one..',
               'Hi ']
        self.assertEqual(exp, split)

    def test_split_by_sentences_pysbd(self):
        # Uses the baseline sentence_splitter value ("def").
        # NOTE(review): presumably "def" selects pysbd, as the test name
        # suggests -- verify against text_splitter.
        params = get_params()

        split = text_splitter.split_by_sentences(split_txt1, params)
        exp = ['Some text.',
               'Mr. John Johnson Jr. was born in the U.S.A but earned his Ph.D. in Israel before joining Nike Inc. as an engineer.',
               'He also worked at craigslist.org as a business analyst.',
               'Some text one.',
               'Some text two.']
        self.assertEqual(exp, split)

        # The doubled period yields a separate '.' item and "Hi" loses its
        # trailing space.
        split = text_splitter.split_by_sentences(split_txt2, params)
        exp = ['Some sentence.',
               'Mr. Holmes...',
               'This is a new sentence!',
               'And This is another one.',
               '.',
               'Hi']
        self.assertEqual(exp, split)

    def test_no_split(self):
        # No split mode enabled: the whole text comes back as a single part.
        params = get_params()

        text = get_text(s1, s2, s3, s4, s5)
        split = text_splitter.split_text(text, params)
        exp = [Part(get_text(s1, s2, s3, s4, s5), False)]

        self.assertEqual(exp, split)

    def test_split_by_few_paragraphs_and_length__long_text_with_one_paragraph(self):
        params = get_params()
        params.split_by_paragraphs_and_length = True

        # A single paragraph is not split, even though it is longer than
        # split_expected_length.
        text = get_text(s1, s2, s3, s4, s5)
        split = text_splitter.split_text(text, params)
        exp = [Part(get_text(s1, s2, s3, s4, s5), True)]

        self.assertEqual(exp, split)

    def test_split_by_few_paragraphs_and_length__short_text_with_one_paragraph(self):
        params = get_params()
        params.split_by_paragraphs_and_length = True

        text = get_text(s1, s2)
        split = text_splitter.split_text(text, params)
        exp = [Part(get_text(s1, s2), True)]

        self.assertEqual(exp, split)

    def test_split_by_few_paragraphs_and_length__text_with_few_paragraphs_01(self):
        params = get_params()
        params.split_by_paragraphs_and_length = True
        # text with few paragraphs
        text = get_text(s1 + "\n\n", s2 + "\n", s3, s4 + "\n", s5, s6, s7, s8 + "\n", s9 + "\n", s1)
        split = text_splitter.split_text(text, params)
        exp = [Part(get_text(s1 + "\n\n", s2), True),
               Part(get_text(s3, s4), True),
               Part(get_text(s5, s6, s7, s8), True),
               Part(get_text(s9 + "\n", s1), True)]

        self.assertEqual(exp, split)

    def test_split_by_few_paragraphs_and_length__text_with_few_paragraphs_02(self):
        params = get_params()
        params.split_by_paragraphs_and_length = True

        # five one-sentence paragraphs
        text = get_text(s1 + "\n", s2 + "\n", s3 + "\n", s4 + "\n", s5)
        split = text_splitter.split_text(text, params)
        exp = [Part(get_text(s1 + "\n", s2), True),
               Part(get_text(s3), True),
               Part(get_text(s4 + "\n", s5), True)]

        self.assertEqual(exp, split)

    def test_split_by_paragraphs_only_01(self):
        params = get_params()
        params.split_by_paragraphs_only = True
        # text with few paragraphs
        text = get_text(s1 + "\n\n", s2 + "\n", s3, s4 + "\n", s5, s6)
        split = text_splitter.split_text(text, params)
        # The blank line after s1 is expected as its own empty part.
        exp = [Part(get_text(s1), True),
               Part(get_text(""), True),
               Part(get_text(s2), True),
               Part(get_text(s3, s4), True),
               Part(get_text(s5, s6), False)]

        self.assertEqual(exp, split)

    def test_split_by_paragraphs_only_02(self):
        params = get_params()
        params.split_by_paragraphs_only = True
        # single sentence without any newline: expected back as one part
        split = text_splitter.split_text(s1, params)

        self.assertEqual([Part(s1, False)], split)

    def test_split_by_few_sentences_and_length_01(self):
        params = get_params()
        params.split_by_sentences_and_length = True
        params.split_expected_length = 25
        # text with few paragraphs
        text = get_text(s1 + "\n\n", s2 + "\n", s3, s4 + "\n", s5, s6, s7, s8 + "\n", s9 + "\n", s1)
        split = text_splitter.split_text(text, params)
        exp = [Part(get_text(s1 + "\n\n", s2), True),
               Part(get_text(s3, s4), True),
               Part(get_text(s5, s6), False),
               Part(get_text(s7, s8), True),
               Part(get_text(s9 + "\n", s1), False)]

        self.assertEqual(exp, split)

    def test_split_by_few_sentences_and_length_02(self):
        params = get_params()
        params.split_by_sentences_and_length = True
        params.split_expected_length = 40
        # text with few paragraphs
        text = get_text(s1 + "\n\n", s2 + "\n", s3, s4 + "\n", s5, s6, s7, s8 + "\n", s9 + "\n", s1)
        split = text_splitter.split_text(text, params)
        exp = [Part(get_text(s1 + "\n\n", s2 + "\n", s3), False),
               Part(get_text(s4 + "\n", s5, s6), False),
               Part(get_text(s7, s8 + "\n", s9), True),
               Part(get_text(s1), False)]

        self.assertEqual(exp, split)

    def test_split_by_few_sentences_and_length_03(self):
        params = get_params()
        params.split_by_sentences_and_length = True
        # single sentence: expected back as one part
        split = text_splitter.split_text(s1, params)

        self.assertEqual([Part(s1, False)], split)

    def test_split_by_sentences_only_01(self):
        params = get_params()
        params.split_by_sentences_only = True
        # single sentence: expected back as one part
        split = text_splitter.split_text(s1, params)
        self.assertEqual([Part(s1, False)], split)

    def test_split_by_sentences_only_02(self):
        params = get_params()
        params.split_by_sentences_only = True
        # text with few paragraphs
        text = get_text(s1 + "\n\n", s2 + "\n", s3, s4 + "\n", s5, s6, s7, s8 + "\n", s9 + "\n", s1)
        split = text_splitter.split_text(text, params)
        # One part per sentence, plus an empty part for the blank line after s1.
        exp = [Part(s1, True),
               Part("", True),
               Part(s2, True),
               Part(s3, False),
               Part(s4, True),
               Part(s5, False),
               Part(s6, False),
               Part(s7, False),
               Part(s8, True),
               Part(s9, True),
               Part(s1, False)]

        self.assertEqual(exp, split)