diff options
| -rw-r--r-- | .gitignore | 1 | ||||
| -rw-r--r-- | community/wacca_plus/wacca_plus.py | 13 | ||||
| -rw-r--r-- | generate.py | 1 | ||||
| -rw-r--r-- | news_feed.py | 16 | ||||
| -rw-r--r-- | requirements.txt | bin | 836 -> 1384 bytes | |||
| -rw-r--r-- | site/src/components/NewsFeed.tsx | 7 | ||||
| -rw-r--r-- | summarizer.py | 100 |
7 files changed, 133 insertions, 5 deletions
@@ -173,3 +173,4 @@ cython_debug/ news tl_cache.json wac_result_cache.json +summarization_cache.json
\ No newline at end of file diff --git a/community/wacca_plus/wacca_plus.py b/community/wacca_plus/wacca_plus.py index d5e0aa4..fae3dd0 100644 --- a/community/wacca_plus/wacca_plus.py +++ b/community/wacca_plus/wacca_plus.py @@ -6,6 +6,10 @@ import openai import json from dotenv import load_dotenv import base64 +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))) + +from summarizer import generate_headline_and_content_from_images load_dotenv() @@ -99,7 +103,9 @@ def parse_announcement_messages(message_json: dict): continue filtered_images = [] + image_urls = [] # save the images before they get encoded for image in image_attachments: + image_urls.append(image["url"]) if image["id"] in cache: is_related = cache[image["id"]][0] type = cache[image["id"]][1] @@ -116,17 +122,18 @@ def parse_announcement_messages(message_json: dict): date = message["timestamp"].split("T")[0] date_obj = datetime.strptime(date, "%Y-%m-%d") unix_time = int(time.mktime(date_obj.timetuple())) + headline, content = generate_headline_and_content_from_images(image_urls, "WACCA PLUS") news_posts.append({ "date": date, "identifier": "WACCA_PLUS", "type": type.upper(), "timestamp": unix_time, - "content": "NEW INFORMATION FROM WACCA PLUS / WACCA PLUS の最新情報", - "headline": None, + "content": content, + "headline": headline, "url": None, "images": filtered_images, - 'is_ai_summary': False + 'is_ai_summary': True }) _save_cache(cache) diff --git a/generate.py b/generate.py index afb96a8..a3edbd3 100644 --- a/generate.py +++ b/generate.py @@ -3,7 +3,6 @@ Generates news JSON files Generally you're expected to update the game versions manually as for most games you only ever want the latest version (supported) of the game """ -from ast import Constant import news_feed as feed import constants import json diff --git a/news_feed.py b/news_feed.py index 18e9dbd..d78c78c 100644 --- a/news_feed.py +++ b/news_feed.py @@ -14,7 +14,8 @@ Generic format for a news entry. All keys are considered to be nullable 'link': If there's an associated href. Else None } - ] + ], + 'is_ai_summary': boolean } """ @@ -36,6 +37,17 @@ import community.museca_plus as mus_plus import community.rbdx as rbdx import constants import translate +import summarizer + +def _attach_llm_summaries(news_posts: list, game_name: str): + for post in news_posts: + image_urls = [img["image"] for img in post.get("images", []) if "image" in img] + if image_urls: + headline, content = summarizer.generate_headline_and_content_from_images(image_urls, game_name) + post["headline"] = headline + post["content"] = content + post["is_ai_summary"] = True + def get_news(news_url: str, version=None) -> list: if news_url == constants.SOUND_VOLTEX_EXCEED_GEAR_NEWS_SITE: @@ -124,6 +136,7 @@ def get_news(news_url: str, version=None) -> list: scraper.close() if version == constants.MAIMAIDX_VERSION.PRISM: news_posts = sorted(maimaidx_intl.parse_maimaidx_intl_prism_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) + _attach_llm_summaries(news_posts, "maimai DX International") elif news_url == constants.ONGEKI_JP_NEWS_SITE: site_data = download_site_as_html(news_url) @@ -154,6 +167,7 @@ def get_news(news_url: str, version=None) -> list: elif news_url == constants.RB_DELUXE_PLUS_NEWS: site_data = download_site_as_html(news_url) news_posts = rbdx.get_carousel_posts(site_data) + _attach_llm_summaries(news_posts, "REFLEC BEAT PLUS DELUXE") else: news_posts = [] diff --git a/requirements.txt b/requirements.txt Binary files differindex c5df683..7cbd3b8 100644 --- a/requirements.txt +++ b/requirements.txt diff --git a/site/src/components/NewsFeed.tsx b/site/src/components/NewsFeed.tsx index 0151975..3e6f6b9 100644 --- a/site/src/components/NewsFeed.tsx +++ b/site/src/components/NewsFeed.tsx @@ -16,6 +16,7 @@ export interface NewsData { }>; en_headline: string | null; en_content: string | null; + is_ai_summary: boolean | null; } interface NewsFeedProps { @@ -102,6 +103,12 @@ export const NewsFeed: React.FC<NewsFeedProps> = ({ newsItems }) => { </button> )} </div> + {/* AI Disclaimer */} + {news.is_ai_summary && ( + <div className={`${isMoe ? "bg-pink-200 text-pink-800" : "bg-gray-800 text-white"} px-3 py-1 text-xs text-center`}> + The information above is summarized by AI / 上記の情報はAIによって生成されました。 + </div> + )} {/* Images */} {news.images.length > 0 && ( diff --git a/summarizer.py b/summarizer.py new file mode 100644 index 0000000..d3d66f1 --- /dev/null +++ b/summarizer.py @@ -0,0 +1,100 @@ +from dotenv import load_dotenv +import openai +import json +import hashlib +import os + +load_dotenv() + + +def summarization_is_possible() -> bool: + return os.getenv("OPENAI_API_KEY") + + +def _load_cache(): + cache_file = "summarization_cache.json" + if not os.path.exists(cache_file): + with open(cache_file, "w") as file: + json.dump({}, file) + with open(cache_file, "r") as file: + return json.load(file) + + +def _save_cache(cache: dict): + cache_file = "summarization_cache.json" + with open(cache_file, "w") as file: + json.dump(cache, file) + + +def _make_cache_key(game: str, img_urls: list[str]) -> str: + normalized_game = game.strip().lower() + img_data = json.dumps(sorted(img_urls), separators=(",", ":")) + hash_digest = hashlib.sha256(img_data.encode()).hexdigest()[:12] + return f"{normalized_game}_{hash_digest}" + + +def generate_headline_and_content_from_images(img_urls: list[str], game: str): + """ + Uses LLM to generate the headline and content when none provided by source, based on one or more images. + """ + cache = _load_cache() + cache_key = _make_cache_key(game, img_urls) + if cache_key in cache: + cached = cache[cache_key] + return cached["headline"], cached["content"] + tools = [ + { + "type": "function", + "function": { + "name": "generate_update_text", + "description": "Generates a concise English headline and short description for a rhythm game update image.", + "parameters": { + "type": "object", + "properties": { + "headline": { + "type": "string", + "description": "A short English headline summarizing the game update.", + }, + "content": { + "type": "string", + "description": "A brief English description of the new content shown in the image(s).", + }, + }, + "required": ["headline", "content"], + }, + }, + } + ] + + messages = [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": ( + f"Given one or more update-related images for the arcade game {game}, return a short, professional English headline and a brief, stern and concise description summarizing the content. No need to repeat game name" + ), + }, + *[{"type": "image_url", "image_url": {"url": url}} for url in img_urls], + ], + } + ] + + response = openai.chat.completions.create( + model="gpt-4o", + messages=messages, + tools=tools, + tool_choice={ + "type": "function", + "function": {"name": "generate_update_text"}, + }, + ) + + tool_result = response.choices[0].message.tool_calls[0].function.arguments + parsed_result = json.loads(tool_result) + headline = parsed_result["headline"] + content = parsed_result["content"] + cache[cache_key] = {"headline": headline, "content": content} + _save_cache(cache) + return headline, content |
