From bff61db431b1801f0c3ae90c36d9533179a825b9 Mon Sep 17 00:00:00 2001 From: Versun Date: Thu, 20 Jun 2024 03:53:45 +0000 Subject: [PATCH] feat: detect and set source language for translation --- core/tasks.py | 11 +++++++---- translator/models/base.py | 3 ++- translator/models/claude.py | 1 + translator/models/free_translators.py | 2 +- translator/models/gemini.py | 1 + utils/text_handler.py | 18 ++++++++++++++++++ 6 files changed, 30 insertions(+), 6 deletions(-) diff --git a/core/tasks.py b/core/tasks.py index 5f96f37..f169c56 100644 --- a/core/tasks.py +++ b/core/tasks.py @@ -250,11 +250,13 @@ def translate_feed( total_tokens = 0 translated_characters = 0 need_cache_objs = {} + source_language = "auto" try: for entry in translated_feed.entries[:max_posts]: title = entry.get("title") - + source_language = text_handler.detect_language(entry) + # Translate title if title and translate_engine and translate_title: cached = Translated_Content.is_translated( @@ -263,7 +265,7 @@ def translate_feed( translated_text = "" if not cached: results = translate_engine.translate( - title, target_language=target_language, text_type="title" + title, target_language=target_language, source_language=source_language, text_type="title" ) translated_text = results.get("text", title) total_tokens += results.get("tokens", 0) @@ -315,7 +317,7 @@ def translate_feed( if content: translated_summary, tokens, characters, need_cache = ( content_translate( - content, target_language, translate_engine, quality + content, target_language, translate_engine, quality, source_language=source_language ) ) total_tokens += tokens @@ -396,6 +398,7 @@ def content_translate( target_language: str, engine: TranslatorEngine, quality: bool = False, + source_language:str = "auto" ): total_tokens = 0 total_characters = 0 @@ -417,7 +420,7 @@ def content_translate( if not cached: results = engine.translate( - text, target_language=target_language, text_type="content" + text, target_language=target_language, 
source_language=source_language, text_type="content" ) total_tokens += results.get("tokens", 0) total_characters += len(text) diff --git a/translator/models/base.py b/translator/models/base.py index d0afd4b..78f6fd3 100644 --- a/translator/models/base.py +++ b/translator/models/base.py @@ -12,7 +12,7 @@ class TranslatorEngine(models.Model): valid = models.BooleanField(_("Valid"), null=True) is_ai = models.BooleanField(default=False, editable=False) - def translate(self, text: str, target_language: str) -> dict: + def translate(self, text: str, target_language: str, source_language:str="auto", **kwargs) -> dict: raise NotImplementedError( "subclasses of TranslatorEngine must provide a translate() method" ) @@ -142,6 +142,7 @@ def translate( system_prompt: str = None, user_prompt: str = None, text_type: str = "title", + **kwargs ) -> dict: logging.info(">>> Translate [%s]: %s", target_language, text) client = self._init() diff --git a/translator/models/claude.py b/translator/models/claude.py index 34d2631..b1e70bc 100644 --- a/translator/models/claude.py +++ b/translator/models/claude.py @@ -55,6 +55,7 @@ def translate( system_prompt: str = None, user_prompt: str = None, text_type: str = "title", + **kwargs ) -> dict: logging.info(">>> Claude Translate [%s]:", target_language) client = self._init() diff --git a/translator/models/free_translators.py b/translator/models/free_translators.py index a68a1a4..0fa24a7 100644 --- a/translator/models/free_translators.py +++ b/translator/models/free_translators.py @@ -34,7 +34,7 @@ def translate(self, text: str, target_language: str, source_language:str="auto", try: source_language = detect(text) if source_language == "auto" else source_language except: - source_language == "auto" + source_language = "auto" logging.warning("Cannot detect source language:%s", text) results = et.translate( diff --git a/translator/models/gemini.py b/translator/models/gemini.py index 0f4fa8e..63eb11d 100644 --- a/translator/models/gemini.py +++ 
b/translator/models/gemini.py
@@ -61,6 +61,7 @@ def translate(
         system_prompt: str = None,
         user_prompt: str = None,
         text_type: str = "title",
+        **kwargs
     ) -> dict:
         logging.info(">>> Gemini Translate [%s]:", target_language)
 
diff --git a/utils/text_handler.py b/utils/text_handler.py
index 249c61e..cdf37f0 100644
--- a/utils/text_handler.py
+++ b/utils/text_handler.py
@@ -5,6 +5,24 @@ import tiktoken
 import html2text
 from translator.models import TranslatorEngine
 
+from langdetect import detect
+
+def detect_language(entry):
+    """Guess the language of a feed entry from its title and body text.
+
+    Returns the code reported by langdetect, or "auto" when the entry has no
+    text or detection fails, so translation is never blocked by detection.
+    """
+    content = entry.get("content")
+    body = content[0].get("value") if content else entry.get("summary")
+    # Title/body may be None; join only present parts so no TypeError escapes.
+    text = " ".join(part for part in (entry.get("title"), body) if part)
+    try:
+        # An empty text makes detect() raise, which lands on the fallback.
+        return detect(text)
+    except Exception as e:
+        logging.warning("Cannot detect source language:%s,%s", e, text)
+        return "auto"
 
 
 def clean_content(content: str) -> str: