feat: add option to append translation to post data

author: Pinapelz <yukais@pinapelz.com> 2025-04-16 21:03:25 -0700
committer: Pinapelz <yukais@pinapelz.com> 2025-04-16 21:03:25 -0700
commit: df87d043d485d0e4282e57679c133c3c71a837bf (patch)
tree: 1a6c34b10105df43ecf9056b07426c2ae8b7d479
parent: 2a87b5d4961cb6275eb78737d5cbc35d6b0e45cc (diff)
6 files changed, 126 insertions, 2 deletions
diff --git a/.gitignore b/.gitignore
index e2ee484..2072926 100644
--- a/.gitignore
+++ b/.gitignore
@@ -171,3 +171,4 @@ cython_debug/
 # PyPI configuration file
 .pypirc
 news
+tl_cache.json
diff --git a/bemani/iidx.py b/bemani/iidx.py
index 0d97e91..67e1085 100644
--- a/bemani/iidx.py
+++ b/bemani/iidx.py
@@ -3,6 +3,9 @@ from datetime import datetime
 from urllib.parse import urljoin
 import re
 
+IIDX_KEY_TERMS = [
+    ("クプロ", "QPro"),  # (term, English form) pair; NOTE(review): looks intended as translate overrides - confirm
+]
 
 def parse_pinky_crush_news_site(html: str, base_url):
     type_map = {
diff --git a/constants.py b/constants.py
index 6d13daf..50f9166 100644
--- a/constants.py
+++ b/constants.py
@@ -12,6 +12,8 @@ MAIMAIDX_JP_NEWS_SITE="https://info-maimai.sega.jp/"
 MAIMAIDX_INTL_NEWS_SITE="https://maimai.sega.com/download/"
 ONGEKI_JP_NEWS_SITE="https://info-ongeki.sega.jp/"
 
+ADD_EN_TRANSLATION=True # Only takes effect if GOOGLE_TRANSLATE_API_KEY is set in the environment (e.g. via .env)
+
 class CHUNITHM_VERSION(Enum):
     LUMINOUS_PLUS = 1
     VERSE = 2
diff --git a/generate.py b/generate.py
index 71093e5..e974bfa 100644
--- a/generate.py
+++ b/generate.py
@@ -4,7 +4,6 @@ Generally you're expected to update the game versions manually
 as for most games you only ever want the latest version (supported) of the game
 """
 import news_feed as feed
-import requests
 import constants
 import json
 import os
diff --git a/news_feed.py b/news_feed.py
index 87782bf..18c2616 100644
--- a/news_feed.py
+++ b/news_feed.py
@@ -16,7 +16,6 @@ Generic format for a news entry. All keys are considered to be nullable
 }
 """
 
-from email.utils import parsedate_to_datetime
 from site_scraper import SiteScraper, download_site_as_html
 import bemani.sdvx as sound_voltex
 import bemani.iidx as iidx
@@ -26,6 +25,7 @@ import sega.maimaidx_jp as maimaidx_jp
 import sega.maimaidx_intl as maimaidx_intl
 import sega.ongeki_jp as ongeki_jp
 import constants
+import translate
 
 def get_news(news_url: str, version=None) -> list:
     if news_url == constants.SOUND_VOLTEX_EXCEED_GEAR_NEWS_SITE:
@@ -35,6 +35,7 @@ def get_news(news_url: str, version=None) -> list:
     elif news_url == constants.IIDX_PINKY_CRUSH_NEWS_SITE:
         site_data = download_site_as_html(news_url)
         news_posts = sorted(iidx.parse_pinky_crush_news_site(site_data, constants.EAMUSEMENT_BASE_URL), key=lambda x: x['timestamp'], reverse=True)
+        news_posts = translate.add_translate_text_to_en(news_posts)
 
     elif news_url == constants.CHUNITHM_JP_NEWS_SITE:
         site_data = download_site_as_html(news_url)
diff --git a/translate.py b/translate.py
new file mode 100644
index 0000000..64ba018
--- /dev/null
+++ b/translate.py
@@ -0,0 +1,118 @@
+from dotenv import load_dotenv
+import requests
+import constants
+import re
+import os
+import json
+import hashlib
+
+
+load_dotenv()
+
+def _encode_links(markdown_text: str) -> tuple:
+    """
+    Swap every markdown link ``[label](target)`` for a numbered placeholder.
+
+    Placeholders are ``573_UPDATE_MARKDOWN_LINK_N`` with N counting from 1 in
+    order of appearance. Returns ``(encoded_text, links)`` where ``links`` is a
+    list of ``(placeholder, original_markdown)`` tuples used to undo the swap.
+    """
+    found = []
+
+    def _swap(m):
+        # The next index is one past the number of links recorded so far.
+        token = f"573_UPDATE_MARKDOWN_LINK_{len(found) + 1}"
+        found.append((token, m.group(0)))
+        return token
+
+    encoded = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', _swap, markdown_text)
+    return encoded, found
+
+def _decode_links(raw_text: str, links: list) -> str:
+    """Return raw_text with each (placeholder, markdown) pair in links substituted back."""
+    # Longest placeholder first: "..._LINK_1" is a prefix of "..._LINK_10", so
+    # replacing in list order would corrupt every link numbered 10 or higher.
+    for placeholder, markdown in sorted(links, key=lambda pair: len(pair[0]), reverse=True):
+        raw_text = raw_text.replace(placeholder, markdown)
+    return raw_text
+
+def _load_translation_cache() -> dict:
+    """
+    Load tl_cache.json as {sha256(source_lang + target_lang + source_txt): result_txt}.
+    A missing file is created holding an empty JSON list, and {} is returned.
+    """
+    cache_file = "tl_cache.json"
+    if not os.path.exists(cache_file):
+        with open(cache_file, "w", encoding="utf-8") as file:
+            json.dump([], file, ensure_ascii=False, indent=4)
+        return {}
+    with open(cache_file, "r", encoding="utf-8") as file:
+        entries = json.load(file)
+    return {hashlib.sha256((e["source_lang"] + e["target_lang"] + e["source_txt"]).encode('utf-8')).hexdigest(): e["result_txt"] for e in entries}
+
+def _add_to_translation_cache(source_lang: str, target_lang: str, source_txt: str, result_txt: str) -> None:
+    """
+    Append one translation record to tl_cache.json, rewriting the whole file.
+
+    A missing file is treated as an empty list. Each record stores the language
+    pair, the source text and the translated text.
+    """
+    cache_file = "tl_cache.json"
+    cache = []
+    if os.path.exists(cache_file):
+        with open(cache_file, "r", encoding="utf-8") as file:
+            cache = json.load(file)
+    cache.append({"source_lang": source_lang, "target_lang": target_lang,
+                  "source_txt": source_txt, "result_txt": result_txt})
+    with open(cache_file, "w", encoding="utf-8") as file:
+        json.dump(cache, file, ensure_ascii=False, indent=4)
+
+def request_google_translate(text: str, source: str="ja", target="en", translation_cache=None) -> str:
+    """
+    Translate ``text`` with the Google Cloud Translation API (v2) and return it.
+
+    Markdown links are swapped for placeholders before the request and restored
+    afterwards. ``translation_cache`` (dict of key -> translated text, optional)
+    is consulted first and updated, along with tl_cache.json, on a miss.
+    Raises requests.HTTPError if the API answers with an error status.
+    """
+    key = hashlib.sha256((source + target + text).encode('utf-8')).hexdigest()
+    if translation_cache and key in translation_cache:
+        return translation_cache[key]
+    encoded_text, restore_data = _encode_links(text)
+    # Send the placeholder text, not the raw text, so links survive translation intact.
+    params = {"q": encoded_text, "source": source, "target": target, "format": "text",
+              "key": os.getenv("GOOGLE_TRANSLATE_API_KEY")}
+    response = requests.post("https://translation.googleapis.com/language/translate/v2", params=params)
+    response.raise_for_status()
+    translated_text = _decode_links(response.json()["data"]["translations"][0]["translatedText"], restore_data)
+    if translation_cache is not None:  # callers may omit the in-memory cache
+        translation_cache[key] = translated_text
+    _add_to_translation_cache(source, target, text, translated_text)
+    return translated_text
+
+def translation_possible() -> bool:  # truthy only when enabled in constants AND an API key is set
+    return constants.ADD_EN_TRANSLATION and "GOOGLE_TRANSLATE_API_KEY" in os.environ
+
+def add_translate_text_to_en(news_post: list, overrides: list=None) -> list:
+    """
+    Add ``en_headline`` / ``en_content`` to every post dict in ``news_post``.
+
+    ``overrides`` is an optional list of (term, replacement) pairs substituted
+    into the text before it is sent for translation. Posts are updated in place
+    and returned as a list; empty or missing fields are skipped. Does nothing
+    when translation_possible() is false (disabled, or no API key).
+    """
+    posts = list(news_post)
+    if not translation_possible():
+        return posts
+    translation_cache = _load_translation_cache()
+    for post in posts:
+        for field in ("headline", "content"):
+            text = post.get(field)
+            if text:
+                for term, replacement in overrides or ():
+                    text = text.replace(term, replacement)
+                post["en_" + field] = request_google_translate(text, translation_cache=translation_cache)
+    return posts
author	Pinapelz <yukais@pinapelz.com>	2025-04-16 21:03:25 -0700
committer	Pinapelz <yukais@pinapelz.com>	2025-04-16 21:03:25 -0700
commit	df87d043d485d0e4282e57679c133c3c71a837bf (patch)
tree	1a6c34b10105df43ecf9056b07426c2ae8b7d479
parent	2a87b5d4961cb6275eb78737d5cbc35d6b0e45cc (diff)