translate.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136

from dotenv import load_dotenv
import requests
import constants
import re
import os
import json
import hashlib


# Load variables from a local .env file (if present) into the process
# environment; the functions below read GOOGLE_TRANSLATE_API_KEY via os.getenv.
load_dotenv()

def _encode_links(markdown_text: str) -> tuple:
    """
    Swap every markdown link ``[label](target)`` for a numbered placeholder.

    The Nth link found (counting from 1) becomes ``573_UPDATE_MARKDOWN_LINK_N``.

    Returns a 2-tuple: the text with placeholders substituted, and a list of
    ``(placeholder, original_markdown_link)`` tuples in order of appearance.
    """
    link_pattern = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
    recorded = []
    pieces = []
    cursor = 0
    for number, found in enumerate(link_pattern.finditer(markdown_text), start=1):
        token = f"573_UPDATE_MARKDOWN_LINK_{number}"
        recorded.append((token, found.group(0)))
        # Keep the untouched text before this link, then the placeholder.
        pieces.append(markdown_text[cursor:found.start()])
        pieces.append(token)
        cursor = found.end()
    # Whatever follows the last link (or the whole text when there are none).
    pieces.append(markdown_text[cursor:])
    return "".join(pieces), recorded

def _decode_links(raw_text: str, links: list) -> str:
    """
    Replace link placeholders in ``raw_text`` with their original markdown.

    ``links`` is a sequence of ``(placeholder, markdown_link)`` pairs. All
    placeholders are substituted in a single pass, trying the longest
    placeholder first, so that e.g. ``..._LINK_1`` never matches the leading
    part of ``..._LINK_10`` and leaves a stray ``0`` behind.

    Returns ``raw_text`` unchanged when ``links`` is empty.
    """
    if not links:
        return raw_text
    originals = {}
    for link in links:
        # First pair wins if a placeholder is listed more than once.
        originals.setdefault(link[0], link[1])
    # Longest-first ordering makes the alternation prefer LINK_10 over LINK_1.
    alternation = "|".join(
        re.escape(placeholder)
        for placeholder in sorted(originals, key=len, reverse=True)
    )
    return re.sub(alternation, lambda found: originals[found.group(0)], raw_text)

def _load_translation_cache() -> dict:
    """
    Load ``tl_cache.json`` from the working directory into a lookup dict.

    The file is expected to hold a JSON list of entries with the keys
    ``source_lang``, ``target_lang``, ``source_txt`` and ``result_txt``.
    Each entry is keyed by the SHA-256 hex digest of
    ``source_lang + target_lang + source_txt`` and maps to ``result_txt``.

    If the file is missing, unreadable as UTF-8 JSON, or does not have the
    expected shape, it is (re)written as an empty list and ``{}`` is returned.
    """
    cache_file = "tl_cache.json"
    if os.path.exists(cache_file):
        try:
            with open(cache_file, "r", encoding="utf-8") as file:
                entries = json.load(file)
            tl_map = {}
            for entry in entries:
                digest_input = entry["source_lang"] + entry["target_lang"] + entry["source_txt"]
                key = hashlib.sha256(digest_input.encode('utf-8')).hexdigest()
                tl_map[key] = entry["result_txt"]
            return tl_map
        # TypeError/AttributeError cover valid JSON of the wrong shape
        # (e.g. null, a list of numbers, non-string fields).
        except (UnicodeDecodeError, json.JSONDecodeError, KeyError, TypeError, AttributeError) as e:
            print(f"Translation cache corrupted ({e}), deleting and starting fresh...")
    # Missing or corrupted cache: start over with an empty list on disk.
    with open(cache_file, "w", encoding="utf-8") as file:
        json.dump([], file, ensure_ascii=False, indent=4)
    return {}

def _add_to_translation_cache(source_lang: str, target_lang: str, source_txt: str, result_txt: str) -> None:
    """
    Append one translation record to ``tl_cache.json`` in the working directory.

    The existing file is read, the new entry is appended, and the whole list
    is written back. If the file is missing, is not valid UTF-8 JSON, or does
    not contain a JSON list, the cache is restarted with only the new entry.
    """
    cache_file = "tl_cache.json"
    cache_entry = {
        "source_lang": source_lang,
        "target_lang": target_lang,
        "source_txt": source_txt,
        "result_txt": result_txt
    }
    cache = []
    if os.path.exists(cache_file):
        try:
            with open(cache_file, "r", encoding="utf-8") as file:
                cache = json.load(file)
        except (UnicodeDecodeError, json.JSONDecodeError) as e:
            print(f"Translation cache corrupted during write ({e}), starting fresh...")
            cache = []
        if not isinstance(cache, list):
            # Valid JSON but not a list (e.g. {} or null): cannot append to it.
            print(
                "Translation cache corrupted during write "
                f"(expected a list, got {type(cache).__name__}), starting fresh..."
            )
            cache = []
    cache.append(cache_entry)
    with open(cache_file, "w", encoding="utf-8") as file:
        json.dump(cache, file, ensure_ascii=False, indent=4)

def request_google_translate(text: str, source: str="ja", target="en", translation_cache=None) -> str:
    """
    Translate ``text`` with the Google Cloud Translation API (v2) and return
    the translated string.

    Markdown links in ``text`` are replaced by placeholders before the request
    and restored in the result, so link labels and targets are not sent for
    translation.

    ``translation_cache`` is an optional dict keyed by the SHA-256 hex digest
    of ``source + target + text``. On a hit the cached value is returned and
    no request is made; on a miss the final translation (links restored) is
    stored in the dict, when one was given, and appended to the on-disk cache.

    Raises ``TypeError`` if the ``GOOGLE_TRANSLATE_API_KEY`` environment
    variable is unset, and ``KeyError`` if the response JSON lacks the
    expected ``data.translations`` structure.
    """
    key = hashlib.sha256((source + target + text).encode('utf-8')).hexdigest()
    if translation_cache and key in translation_cache:
        return translation_cache[key]
    API_KEY = os.getenv("GOOGLE_TRANSLATE_API_KEY")
    encoded_text, restore_data = _encode_links(text)
    url = "https://translation.googleapis.com/language/translate/v2?key="+API_KEY
    payload = {
        # Send the placeholder-encoded text; sending the raw text would make
        # the encode/decode round trip a no-op.
        "q": encoded_text,
        "source": source,
        "target": target,
        "format": "text",
    }
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.post(url, json=payload, timeout=30)
    data = response.json()
    # Restore links before caching so cache hits never return placeholders.
    translated_text = _decode_links(
        data["data"]["translations"][0]["translatedText"], restore_data
    )
    if translation_cache is not None:
        translation_cache[key] = translated_text
    _add_to_translation_cache(source, target, text, translated_text)
    return translated_text

def translation_possible() -> bool:
    """
    Report whether EN translation can be attempted: the
    ``constants.ADD_EN_TRANSLATION`` switch must be truthy and the
    ``GOOGLE_TRANSLATE_API_KEY`` environment variable must be set.
    """
    enabled = constants.ADD_EN_TRANSLATION
    if not enabled:
        # Hand back the falsy switch value itself, as `and` would.
        return enabled
    api_key = os.getenv("GOOGLE_TRANSLATE_API_KEY")
    return api_key is not None

def add_translate_text_to_en(news_post: list, overrides: list=None) -> list:
    """
    Add English translations to each post in ``news_post``.

    ``news_post`` is iterated as a sequence of post dicts. For every post the
    ``headline`` and ``content`` values are translated and stored under
    ``en_headline`` and ``en_content``; a missing or empty field yields
    ``None`` for its ``en_`` counterpart. Posts are modified in place and also
    collected into the returned list.

    ``overrides`` is an optional sequence of ``(old, new)`` pairs applied with
    ``str.replace`` to the source text before it is translated.

    When translation is not possible (see ``translation_possible``),
    ``news_post`` is returned untouched.
    """
    if not translation_possible():
        return news_post
    # None instead of a mutable [] default; treat it as "no overrides".
    replacements = [] if overrides is None else overrides
    translated_posts = []
    translation_cache = _load_translation_cache()
    for post in news_post:
        # Headline first, then content, so keys are added in a stable order.
        for field in ("headline", "content"):
            source_text = post.get(field)
            if source_text:
                for override in replacements:
                    source_text = source_text.replace(override[0], override[1])
                post[f"en_{field}"] = request_google_translate(
                    source_text, translation_cache=translation_cache
                )
            else:
                post[f"en_{field}"] = None
        translated_posts.append(post)
    return translated_posts