Skip to content

Translator Class

Translator

This class defines and stores a language model (such as MarianMT) for the translation task, from source_language to target_language. It also provides functions to perform full translations efficiently from extracted subtitles.

Warning

The translation model will be downloaded from Hugging Face servers and cached for a faster load next time. For each (source_language, target_language) pair, there is a distinct model.

Parameters:

Name Type Description Default
source_language AvailableLanguages

Language of the source subtitles.

required
target_language AvailableLanguages

Target language.

required
Source code in subtitles_translator/translator.py
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
class Translator:
    """Store a translation model (such as MarianMT) for one language pair and apply it to subtitles.

    The model name is built as ``Helsinki-NLP/opus-mt-{source}-{target}`` from the
    ``value`` attribute of the two language arguments; the matching tokenizer and
    sequence-to-sequence model are loaded in the constructor.

    Warning:
        The translation model will be downloaded from Hugging Face servers and cached for a faster load next time.
        For each (source_language, target_language) pair, there is a distinct model.

    Args:
        source_language (AvailableLanguages): Language of the source subtitles.
        target_language (AvailableLanguages): Target language.

    """

    def __init__(
        self,
        source_language: AvailableLanguages,
        target_language: AvailableLanguages,
    ) -> None:
        self.source_language = source_language
        self.target_language = target_language

        # One pretrained checkpoint exists per (source, target) pair.
        model_name = f"Helsinki-NLP/opus-mt-{source_language.value}-{target_language.value}"

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    def translate(self, input_text: str) -> str:
        """Translate a text input using the model.

        Args:
            input_text (str): Text to be translated (usually, a single sentence)

        Returns:
            str: Translated text.

        """
        # The text is wrapped in a one-element batch, so the single decoded
        # string is taken back out with [0].
        batch = self.tokenizer([input_text], return_tensors="pt")
        generated_ids = self.model.generate(**batch)
        translated_text = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

        return translated_text  # type: ignore  # noqa: PGH003

    def translate_subtitles(self, subtitles: Subtitles) -> None:
        """Translate every aggregated subtitle text and write the result back in place.

        Iterates over ``subtitles.aggregated_dico_lines`` (text -> list of line-position
        groups), translates each text once, and overwrites the matching entries of
        ``subtitles.full_text_lines``. A progress bar is printed while iterating.

        Args:
            subtitles (Subtitles object): Object of the Subtitles class.

        """
        # here the iterable is aggregated_dico_lines.items(), we use progressBar
        for full_line_text, lines_ranges in self.progressBar(
            iterable=subtitles.aggregated_dico_lines.items(), prefix="Progress:", suffix="Complete", length=50
        ):
            # Line breaks are removed before the text is handed to the model.
            line_text = full_line_text.replace("\n", "")
            translated_text = self.translate(line_text)
            for line_pos in lines_ranges:
                if len(line_pos) == 1:
                    # case when the current screen subtitles stands on 1 line
                    subtitles.full_text_lines[line_pos[0]] = translated_text
                elif len(line_pos) == 2:
                    # case when we're on two lines subtitles:
                    # we try to "rebuild" them by splitting the words in two halves
                    split_text = translated_text.split(" ")
                    mid_line = len(split_text) // 2
                    first_line = " ".join(split_text[:mid_line])
                    second_line = " ".join(split_text[mid_line:])
                    subtitles.full_text_lines[line_pos[0]] = first_line
                    subtitles.full_text_lines[line_pos[1]] = second_line

    @staticmethod
    def progressBar(
        iterable: Iterable[tuple[str, list]],
        prefix: str = "",
        suffix: str = "",
        decimals: int = 1,
        length: int = 100,
        fill: str = "█",
        print_end: str = "\r",
    ) -> Generator[tuple[str, list], None, None]:
        """
        Call in a loop to create terminal progress bar.
        Source : https://stackoverflow.com/questions/3173320/text-progress-bar-in-terminal-with-block-characters/13685020

        Items are yielded unchanged; the bar is redrawn after each item has been
        consumed. An empty iterable yields nothing and draws a full (100%) bar.
        An iterable that does not support ``len()`` is first materialized into a list.

        Args:
            iterable (Iterable): Iterable object
            prefix (str, optional): Prefix string
            suffix (str, optional): Suffix string
            decimals (int, optional): Positive number of decimals in percent complete
            length (int, optional): Character length of bar
            fill (str, optional): Bar fill character (Str)
            print_end (str, optional): End character

        Yields:
            The items of ``iterable``, in order.

        """
        try:
            total = len(iterable)  # type: ignore[arg-type]
        except TypeError:
            # Generators and other unsized iterables: the total is needed up
            # front to compute percentages, so collect the items first.
            iterable = list(iterable)
            total = len(iterable)

        # Progress Bar Printing Function
        def print_progress_bar(iteration: int) -> None:
            if total:
                percent_value = 100 * (iteration / float(total))
                filled_length = int(length * iteration // total)
            else:
                # Nothing to process: show the bar as complete instead of
                # dividing by zero.
                percent_value = 100.0
                filled_length = int(length)
            percent = f"{percent_value:.{decimals}f}"
            bar = fill * filled_length + "-" * (length - filled_length)
            print(f"\r{prefix} |{bar}| {percent}% {suffix}", end=print_end)

        # Initial Call
        print_progress_bar(0)
        # Update Progress Bar
        for i, item in enumerate(iterable):
            yield item
            print_progress_bar(i + 1)
        # Print New Line on Complete
        print()

progressBar(iterable, prefix='', suffix='', decimals=1, length=100, fill='█', print_end='\r') staticmethod

Call in a loop to create terminal progress bar. Source : https://stackoverflow.com/questions/3173320/text-progress-bar-in-terminal-with-block-characters/13685020

Parameters:

Name Type Description Default
iterable Iterable

Iterable object

required
prefix str

Prefix string

''
suffix str

Suffix string

''
decimals int

Positive number of decimals in percent complete

1
length int

Character length of bar

100
fill str

Bar fill character (Str)

'█'
print_end str

End character

'\r'
Source code in subtitles_translator/translator.py
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
@staticmethod
def progressBar(
    iterable: Iterable[tuple[str, list]],
    prefix: str = "",
    suffix: str = "",
    decimals: int = 1,
    length: int = 100,
    fill: str = "█",
    print_end: str = "\r",
) -> Generator[tuple[str, list], None, None]:
    """
    Call in a loop to create terminal progress bar.
    Source : https://stackoverflow.com/questions/3173320/text-progress-bar-in-terminal-with-block-characters/13685020

    Args:
        iterable (Iterable): Iterable object
        prefix (str, optional): Prefix string
        suffix (str, optional): Suffix string
        decimals (int, optional): Positive number of decimals in percent complete
        length (int, optional) : Character length of bar
        fill (str, optional): Bar fill character (Str)
        print_end (str, optional): End character

    """
    total = len(iterable)  # type: ignore # type checks fails here (len of iterable ?)  # noqa: PGH003
    # Progress Bar Printing Function
    def printProgressBar(iteration: int) -> None:
        percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
        filledLength = int(length * iteration // total)
        bar = fill * filledLength + "-" * (length - filledLength)
        print(f"\r{prefix} |{bar}| {percent}% {suffix}", end=print_end)

    # Initial Call
    printProgressBar(0)
    # Update Progress Bar
    for i, item in enumerate(iterable):
        yield item
        printProgressBar(i + 1)
    # Print New Line on Complete
    print()

translate(input_text)

Translate a text input using the model.

Parameters:

Name Type Description Default
input_text str

Text to be translated (usually, a single sentence)

required

Returns:

Name Type Description
str str

Translated text.

Source code in subtitles_translator/translator.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
def translate(self, input_text: str) -> str:
    """Run a single piece of text through the tokenizer and model and return the decoded output.

    Args:
        input_text (str): Text to be translated (usually, a single sentence)

    Returns:
        str: Translated text.

    """
    # The tokenizer is given a one-element batch, hence the [0] when decoding.
    encoded = self.tokenizer([input_text], return_tensors="pt")
    output_ids = self.model.generate(**encoded)
    decoded = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)

    return decoded[0]  # type: ignore  # noqa: PGH003

translate_subtitles(subtitles)

Use given translator to perform translation using dictionary of aggregated subtitles lines. Each translated line replaces the original one in the full_text_lines list created at the beginning of the process.

Parameters:

Name Type Description Default
subtitles Subtitles object

Object of the Subtitles class.

required
Source code in subtitles_translator/translator.py
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
def translate_subtitles(self, subtitles: Subtitles) -> None:
    """Translate each aggregated subtitle text and overwrite the original lines in place.

    Every key of ``subtitles.aggregated_dico_lines`` is translated once; the
    result is written to the positions of ``subtitles.full_text_lines`` listed
    in the associated value. Progress is reported through ``progressBar``.

    Args:
        subtitles (Subtitles object): Object of the Subtitles class.

    """
    entries = subtitles.aggregated_dico_lines.items()
    for source_text, position_groups in self.progressBar(
        iterable=entries, prefix="Progress:", suffix="Complete", length=50
    ):
        translation = self.translate(source_text.replace("\n", ""))
        for positions in position_groups:
            group_size = len(positions)
            if group_size == 1:
                # Subtitle shown on a single line: store the translation as is.
                subtitles.full_text_lines[positions[0]] = translation
            elif group_size == 2:
                # Subtitle shown on two lines: rebuild two lines by cutting
                # the translated words at the halfway point.
                words = translation.split(" ")
                half = len(words) // 2
                upper = " ".join(words[:half])
                lower = " ".join(words[half:])
                subtitles.full_text_lines[positions[0]] = upper
                subtitles.full_text_lines[positions[1]] = lower