diff options
| author | Pinapelz <yukais@pinapelz.com> | 2025-04-16 21:03:25 -0700 |
|---|---|---|
| committer | Pinapelz <yukais@pinapelz.com> | 2025-04-16 21:03:25 -0700 |
| commit | df87d043d485d0e4282e57679c133c3c71a837bf (patch) | |
| tree | 1a6c34b10105df43ecf9056b07426c2ae8b7d479 | |
| parent | 2a87b5d4961cb6275eb78737d5cbc35d6b0e45cc (diff) | |
feat: add option to append translation to post data
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | .gitignore | 1 |
| -rw-r--r-- | bemani/iidx.py | 3 |
| -rw-r--r-- | constants.py | 2 |
| -rw-r--r-- | generate.py | 1 |
| -rw-r--r-- | news_feed.py | 3 |
| -rw-r--r-- | translate.py | 118 |
6 files changed, 126 insertions, 2 deletions
@@ -171,3 +171,4 @@ cython_debug/ # PyPI configuration file .pypirc news +tl_cache.json diff --git a/bemani/iidx.py b/bemani/iidx.py index 0d97e91..67e1085 100644 --- a/bemani/iidx.py +++ b/bemani/iidx.py @@ -3,6 +3,9 @@ from datetime import datetime from urllib.parse import urljoin import re +IIDX_KEY_TERMS = [ + ("クプロ", "QPro") +] def parse_pinky_crush_news_site(html: str, base_url): type_map = { diff --git a/constants.py b/constants.py index 6d13daf..50f9166 100644 --- a/constants.py +++ b/constants.py @@ -12,6 +12,8 @@ MAIMAIDX_JP_NEWS_SITE="https://info-maimai.sega.jp/" MAIMAIDX_INTL_NEWS_SITE="https://maimai.sega.com/download/" ONGEKI_JP_NEWS_SITE="https://info-ongeki.sega.jp/" +ADD_EN_TRANSLATION=True # Only takes effect if an API key is provided in .env + class CHUNITHM_VERSION(Enum): LUMINOUS_PLUS = 1 VERSE = 2 diff --git a/generate.py b/generate.py index 71093e5..e974bfa 100644 --- a/generate.py +++ b/generate.py @@ -4,7 +4,6 @@ Generally you're expected to update the game versions manually as for most games you only ever want the latest version (supported) of the game """ import news_feed as feed -import requests import constants import json import os diff --git a/news_feed.py b/news_feed.py index 87782bf..18c2616 100644 --- a/news_feed.py +++ b/news_feed.py @@ -16,7 +16,6 @@ Generic format for a news entry. 
All keys are considered to be nullable } """ -from email.utils import parsedate_to_datetime from site_scraper import SiteScraper, download_site_as_html import bemani.sdvx as sound_voltex import bemani.iidx as iidx @@ -26,6 +25,7 @@ import sega.maimaidx_jp as maimaidx_jp import sega.maimaidx_intl as maimaidx_intl import sega.ongeki_jp as ongeki_jp import constants +import translate def get_news(news_url: str, version=None) -> list: if news_url == constants.SOUND_VOLTEX_EXCEED_GEAR_NEWS_SITE: @@ -35,6 +35,7 @@ def get_news(news_url: str, version=None) -> list: elif news_url == constants.IIDX_PINKY_CRUSH_NEWS_SITE: site_data = download_site_as_html(news_url) news_posts = sorted(iidx.parse_pinky_crush_news_site(site_data, constants.EAMUSEMENT_BASE_URL), key=lambda x: x['timestamp'], reverse=True) + news_posts = translate.add_translate_text_to_en(news_posts) elif news_url == constants.CHUNITHM_JP_NEWS_SITE: site_data = download_site_as_html(news_url) diff --git a/translate.py b/translate.py new file mode 100644 index 0000000..64ba018 --- /dev/null +++ b/translate.py @@ -0,0 +1,118 @@ +from dotenv import load_dotenv +import requests +import constants +import re +import os +import json +import hashlib + + +load_dotenv() + +def _encode_links(markdown_text: str) -> tuple: + """ + Find all occurrences of markdown links, replace them with 573_UPDATE_MARKDOWN_LINK_N where N is the nth link, + and record the word, its markdown replacement, and the occurrence count. 
+ """ + link_pattern = re.compile(r'\[([^\]]+)\]\(([^)]+)\)') + links = [] + link_count = 0 + + def replacer(match): + nonlocal link_count + link_count += 1 + markdown_replacement = match.group(0) + placeholder = f"573_UPDATE_MARKDOWN_LINK_{link_count}" + links.append((placeholder, markdown_replacement)) + return placeholder + + return link_pattern.sub(replacer, markdown_text), links + +def _decode_links(raw_text: str, links: list) -> str: + """ + Replaces the placeholders with hyperlinks + """ + for link in links: + raw_text = raw_text.replace(link[0], link[1]) + return raw_text + +def _load_translation_cache() -> list: + cache_file = "tl_cache.json" + tl_map = {} + if os.path.exists(cache_file): + with open(cache_file, "r", encoding="utf-8") as file: + entries = json.load(file) + for entry in entries: + key = hashlib.sha256((entry["source_lang"] + entry["target_lang"] + entry["source_txt"]).encode('utf-8')).hexdigest() + tl_map[key] = entry["result_txt"] + return tl_map + else: + with open(cache_file, "w", encoding="utf-8") as file: + json.dump([], file, ensure_ascii=False, indent=4) + return {} + +def _add_to_translation_cache(source_lang: str, target_lang: str, source_txt: str, result_txt: str) -> None: + cache_file = "tl_cache.json" + cache_entry = { + "source_lang": source_lang, + "target_lang": target_lang, + "source_txt": source_txt, + "result_txt": result_txt + } + if os.path.exists(cache_file): + with open(cache_file, "r", encoding="utf-8") as file: + cache = json.load(file) + else: + cache = [] + cache.append(cache_entry) + with open(cache_file, "w", encoding="utf-8") as file: + json.dump(cache, file, ensure_ascii=False, indent=4) + +def request_google_translate(text: str, source: str="ja", target="en", translation_cache=None) -> tuple: + """ + Translates input text and returns the translated text using Google Cloud Translation API. 
+ """ + key = hashlib.sha256((source + target + text).encode('utf-8')).hexdigest() + if translation_cache and key in translation_cache: + return translation_cache[key] + API_KEY = os.getenv("GOOGLE_TRANSLATE_API_KEY") + encoded_text, restore_data = _encode_links(text) + url = "https://translation.googleapis.com/language/translate/v2" + params = { + "q": text, + "source": source, + "target": target, + "format": "text", + "key": API_KEY, + } + response = requests.post(url, params=params) + data = response.json() + translated_text = data["data"]["translations"][0]["translatedText"] + translation_cache[key] = translated_text + _add_to_translation_cache(source, target, text, translated_text) + return _decode_links(translated_text, restore_data) + +def translation_possible() -> bool: + return constants.ADD_EN_TRANSLATION and os.getenv("GOOGLE_TRANSLATE_API_KEY") is not None + +def add_translate_text_to_en(news_post: dict, overrides: list=[]) -> dict: + """ + Takes a news post dict as input, then appends the translated EN headline and content + to the newspost and returns it + """ + translated_posts = [] + translation_cache = _load_translation_cache() + for post in news_post: + headline = post["headline"] + if headline: + for override in overrides: + headline = headline.replace(override[0], override[1]) + post["en_headline"] = request_google_translate(headline, translation_cache=translation_cache) + content = post["content"] + if content: + for override in overrides: + content = content.replace(override[0], override[1]) + en_content = request_google_translate(content, translation_cache=translation_cache) + post["en_content"] = en_content + translated_posts.append(post) + return translated_posts |
