Skip to content

Commit

Permalink
feat: detect and set source language for translation
Browse files Browse the repository at this point in the history
  • Loading branch information
versun committed Jun 20, 2024
1 parent 05fc049 commit bff61db
Show file tree
Hide file tree
Showing 6 changed files with 30 additions and 6 deletions.
11 changes: 7 additions & 4 deletions core/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,11 +250,13 @@ def translate_feed(
total_tokens = 0
translated_characters = 0
need_cache_objs = {}
source_language = "auto"

try:
for entry in translated_feed.entries[:max_posts]:
title = entry.get("title")

source_language = text_handler.detect_language(entry)

# Translate title
if title and translate_engine and translate_title:
cached = Translated_Content.is_translated(
Expand All @@ -263,7 +265,7 @@ def translate_feed(
translated_text = ""
if not cached:
results = translate_engine.translate(
title, target_language=target_language, text_type="title"
title, target_language=target_language, source_language=source_language, text_type="title"
)
translated_text = results.get("text", title)
total_tokens += results.get("tokens", 0)
Expand Down Expand Up @@ -315,7 +317,7 @@ def translate_feed(
if content:
translated_summary, tokens, characters, need_cache = (
content_translate(
content, target_language, translate_engine, quality
content, target_language, translate_engine, quality, source_language=source_language
)
)
total_tokens += tokens
Expand Down Expand Up @@ -396,6 +398,7 @@ def content_translate(
target_language: str,
engine: TranslatorEngine,
quality: bool = False,
source_language:str = "auto"
):
total_tokens = 0
total_characters = 0
Expand All @@ -417,7 +420,7 @@ def content_translate(

if not cached:
results = engine.translate(
text, target_language=target_language, text_type="content"
text, target_language=target_language, source_language=source_language, text_type="content"
)
total_tokens += results.get("tokens", 0)
total_characters += len(text)
Expand Down
3 changes: 2 additions & 1 deletion translator/models/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ class TranslatorEngine(models.Model):
valid = models.BooleanField(_("Valid"), null=True)
is_ai = models.BooleanField(default=False, editable=False)

def translate(self, text: str, target_language: str) -> dict:
# Base-class placeholder for the translation entry point: it performs no
# translation and always raises NotImplementedError, so every engine subclass
# must supply its own translate().
# source_language defaults to "auto"; **kwargs absorbs any extra keyword
# arguments (callers in this change pass text_type=...).
# NOTE(review): callers read the "text" and "tokens" keys from the returned
# dict — confirm the full expected schema against the concrete engines.
def translate(self, text: str, target_language: str, source_language:str="auto", **kwargs) -> dict:
raise NotImplementedError(
"subclasses of TranslatorEngine must provide a translate() method"
)
Expand Down Expand Up @@ -142,6 +142,7 @@ def translate(
system_prompt: str = None,
user_prompt: str = None,
text_type: str = "title",
**kwargs
) -> dict:
logging.info(">>> Translate [%s]: %s", target_language, text)
client = self._init()
Expand Down
1 change: 1 addition & 0 deletions translator/models/claude.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def translate(
system_prompt: str = None,
user_prompt: str = None,
text_type: str = "title",
**kwargs
) -> dict:
logging.info(">>> Claude Translate [%s]:", target_language)
client = self._init()
Expand Down
2 changes: 1 addition & 1 deletion translator/models/free_translators.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def translate(self, text: str, target_language: str, source_language:str="auto",
try:
source_language = detect(text) if source_language == "auto" else source_language
except:
source_language == "auto"
source_language = "auto"
logging.warning("Cannot detect source language:%s", text)

results = et.translate(
Expand Down
1 change: 1 addition & 0 deletions translator/models/gemini.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ def translate(
system_prompt: str = None,
user_prompt: str = None,
text_type: str = "title",
**kwargs
) -> dict:
logging.info(">>> Gemini Translate [%s]:", target_language)

Expand Down
18 changes: 18 additions & 0 deletions utils/text_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,24 @@
import tiktoken
import html2text
from translator.models import TranslatorEngine
from langdetect import detect

def detect_language(entry):
    """Guess the source language of a feed entry from its title and body.

    The entry's title is combined with its body text — the ``value`` of the
    first ``content`` item when ``content`` is non-empty, otherwise
    ``summary`` — and the result is handed to ``detect``.

    Args:
        entry: Mapping-like feed entry exposing ``get``. ``title``,
            ``content`` and ``summary`` may each be missing or ``None``.

    Returns:
        The language code reported by ``detect``, or ``"auto"`` when the
        entry carries no usable text or detection fails.
    """
    title = entry.get("title")
    original_content = entry.get("content")
    content = (
        original_content[0].get("value")
        if original_content
        else entry.get("summary")
    )

    # Either field may be absent. Joining only the non-empty parts avoids the
    # TypeError that ``None + " "`` raised before, which escaped the try block
    # and aborted the caller's translation loop.
    text = " ".join(part for part in (title, content) if part)
    if not text.strip():
        # Nothing to analyse; let the translation engine auto-detect instead.
        return "auto"

    try:
        return detect(text)
    except Exception as e:
        # Detection is best-effort: log and fall back rather than fail the feed.
        logging.warning("Cannot detect source language:%s,%s", e, text)
        return "auto"


def clean_content(content: str) -> str:
Expand Down

0 comments on commit bff61db

Please sign in to comment.