From caa3cf245186ab0f6fb33e63a7dd838d834da12e Mon Sep 17 00:00:00 2001 From: Pinapelz Date: Thu, 12 Mar 2026 13:56:30 -0700 Subject: refactor: move to common NewsSource interface cleanup imports by defining initaliazers modules and decorator remove legacy scrapers remove single factory for sega games (sites don't change that much) --- bandai_namco/__init__.py | 19 ++ bandai_namco/wmmt.py | 5 +- bemani/__init__.py | 7 + bemani/ddr.py | 63 ------- bemani/iidx.py | 68 ------- community/__init__.py | 12 ++ community/disc.py | 4 +- community/wacca_plus/__init__.py | 0 community/wacca_plus/wacca_plus.py | 2 - constants.py | 4 +- generate.py | 44 ++--- konami/__init__.py | 6 + news_feed.py | 356 ++++++++++++++++++++++++------------- scrapers/__init__.py | 0 scrapers/base.py | 8 + scrapers/registry.py | 22 +++ sega/__init__.py | 23 +++ sega/chuni_intl.py | 166 ++++++++--------- sega/chuni_jp.py | 183 +++++++++---------- sega/maimaidx_intl.py | 49 ----- sega/maimaidx_jp.py | 107 ++++++----- sega/ongeki_jp.py | 120 ++++++------- taito/__init__.py | 7 + taito/music_diver.py | 3 +- taito/street_fighter.py | 142 +++++++-------- 25 files changed, 681 insertions(+), 739 deletions(-) create mode 100644 bandai_namco/__init__.py create mode 100644 bemani/__init__.py delete mode 100644 bemani/ddr.py delete mode 100644 bemani/iidx.py create mode 100644 community/__init__.py create mode 100644 community/wacca_plus/__init__.py create mode 100644 konami/__init__.py create mode 100644 scrapers/__init__.py create mode 100644 scrapers/base.py create mode 100644 scrapers/registry.py create mode 100644 sega/__init__.py create mode 100644 taito/__init__.py diff --git a/bandai_namco/__init__.py b/bandai_namco/__init__.py new file mode 100644 index 0000000..0e0ce82 --- /dev/null +++ b/bandai_namco/__init__.py @@ -0,0 +1,19 @@ +from bandai_namco.taiko import parse_taiko_blog_site +from bandai_namco.wmmt import ( + get_wmmt_na_news_post_links, + get_wmmt_asia_oce_news_post_links, + get_wmmt_jp_news_post_links, + parse_wmmt_na_news, + parse_wmmt_asia_oce_news, + parse_wmmt_jp_news, +) + +__all__ = [ + "parse_taiko_blog_site", + "get_wmmt_na_news_post_links", + "get_wmmt_asia_oce_news_post_links", + "get_wmmt_jp_news_post_links", + "parse_wmmt_na_news", + "parse_wmmt_asia_oce_news", + "parse_wmmt_jp_news", +] \ No newline at end of file diff --git a/bandai_namco/wmmt.py b/bandai_namco/wmmt.py index b7ea927..1a6bbbe 100644 --- a/bandai_namco/wmmt.py +++ b/bandai_namco/wmmt.py @@ -2,11 +2,8 @@ import re from datetime import datetime, timedelta, timezone from enum import Enum from urllib.parse import urljoin -import sys -import os import pytz -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))) -import constants +from .. import constants from bs4 import BeautifulSoup BASE_URL = "https://wanganmaxi-official.com" diff --git a/bemani/__init__.py b/bemani/__init__.py new file mode 100644 index 0000000..f16ed0a --- /dev/null +++ b/bemani/__init__.py @@ -0,0 +1,7 @@ +from bemani.sdvx import parse_exceed_gear_news_site +from bemani.polaris_chord import parse_polaris_chord_news_site + +__all__ = [ + "parse_exceed_gear_news_site", + "parse_polaris_chord_news_site", +] diff --git a/bemani/ddr.py b/bemani/ddr.py deleted file mode 100644 index b5ae93c..0000000 --- a/bemani/ddr.py +++ /dev/null @@ -1,63 +0,0 @@ -""" -Currently unused as e-eamusement app feed is favored. Here for archival purposes -""" -from bs4 import BeautifulSoup -from datetime import datetime -from urllib.parse import urljoin -import time - -def parse_ddr_world_news_site(html: str): - base_url = "https://p.eagate.573.jp" - soup = BeautifulSoup(html, 'html.parser') - news_entries = [] - - for div in soup.select("div#info > div.news_one"): - style = div.get('style', '') - if 'none' in style: - continue - - title_tag = div.select_one("div.news_title > div.title") - date_tag = div.select_one("div.news_title > div.date") - headline = title_tag.get_text(strip=True) if title_tag else None - date_str = date_tag.get_text(strip=True) if date_tag else None - - try: - dt = datetime.strptime(date_str, "%Y/%m/%d") - date_iso = dt.strftime("%Y-%m-%d") - timestamp = int(time.mktime(dt.timetuple())) - except Exception: - date_iso, timestamp = None, None - paras = [p.get_text(strip=True, separator="\n\n") - for p in div.find_all("p", recursive=False)] - if not paras: - for child in div.find_all(recursive=False): - cls = child.get("class", []) - if "news_title" in cls or "img_news_center" in cls: - continue - if child.name == "div": - paras.append(child.get_text(strip=True, separator="\n\n")) - - content = "\n\n\n".join(paras) if paras else None - if content: - content = f"\n{content}\n" - - images = [] - for img in div.select("div.img_news_center img"): - raw_src = img.get("data-src") or img.get("src") - if raw_src: - full_url = urljoin(base_url, raw_src) - images.append({"image": full_url, "link": None}) - - news_entries.append({ - "date": date_iso, - "identifier": "DDR", - "type": None, - "timestamp": timestamp, - "headline": headline, - "content": content, - "url": None, - "images": images, - 'is_ai_summary': False - }) - - return news_entries diff --git a/bemani/iidx.py b/bemani/iidx.py deleted file mode 100644 index de7f34c..0000000 --- a/bemani/iidx.py +++ /dev/null @@ -1,68 +0,0 @@ -from bs4 import BeautifulSoup -from datetime import datetime -from urllib.parse import urljoin -import re - -KEY_TERMS_TL = [ - ("クプロ", "QPro") -] - -# Legacy code. e-amuse feed provides better data -def parse_pinky_crush_news_site(html: str): - base_url = "https://p.eagate.573.jp" - type_map = { - "i_01": "NEWSONG", - "i_02": "RANKING", - "i_03": "EVENT", - "i_04": "SHOP", - "i_05": "OTHER" - } - soup = BeautifulSoup(html, "html.parser") - news_items = [] - - for li in soup.select("#info-news > li"): - date_elem = li.select_one(".news-main > li:nth-of-type(1)") - headline_elem = li.select_one(".news-main > li:nth-of-type(2)") - content_elem = li.select_one(".news-main > li:nth-of-type(3)") - type_class = li.get("class", [None])[0] - if not (date_elem and content_elem): - continue - date_str = date_elem.text.strip() - try: - dt = datetime.strptime(date_str, "%Y/%m/%d") - timestamp = int(dt.timestamp()) - except ValueError: - timestamp = None - - headline = headline_elem.a.text.strip() if headline_elem.a else headline_elem.text.strip() - - for a in content_elem.select("a[href]"): - href = urljoin(base_url, a["href"]) - text = a.get_text(strip=True) - a.replace_with(f"[{text}]({href})") - - for br in content_elem.find_all("br"): - br.replace_with("\n") - - content = content_elem.get_text().strip() - - content = content.replace( - " e-amusement ベーシックコース ", - " e-amusement ベーシックコース " - ) - content = content.replace("※", "\n※") - content = re.sub(r"\n[ \t]+", "\n", content) - content = re.sub(r'\s*/\s*', '/', content) - news_items.append({ - "date": date_str, - "identifier": "IIDX", - "type": type_map[type_class], - "timestamp": timestamp, - "headline": headline, - "content": content, - "url": None, - "images": [], - 'is_ai_summary': False - }) - - return news_items diff --git a/community/__init__.py b/community/__init__.py new file mode 100644 index 0000000..835b7e6 --- /dev/null +++ b/community/__init__.py @@ -0,0 +1,12 @@ +from community.disc import fetch_messages +from community.museca_plus import parse_museca_plus_news_site +from community.rbdx import get_carousel_posts +from community.wacca_plus.wacca_plus import parse_announcement_messages, check_is_generation_possible + +__all__ = [ + "fetch_messages", + "parse_museca_plus_news_site", + "get_carousel_posts", + "parse_announcement_messages", + "check_is_generation_possible", +] \ No newline at end of file diff --git a/community/disc.py b/community/disc.py index b6b7815..90e32f6 100644 --- a/community/disc.py +++ b/community/disc.py @@ -1,8 +1,6 @@ import os import requests -from dotenv import load_dotenv -load_dotenv() def fetch_messages(channel_id: str): url = f"https://discord.com/api/v9/channels/{channel_id}/messages?limit=50" @@ -23,4 +21,4 @@ def fetch_messages(channel_id: str): "x-discord-timezone": "America/Vancouver", } response = requests.get(url, headers=headers) - return response.json() + return response.json() \ No newline at end of file diff --git a/community/wacca_plus/__init__.py b/community/wacca_plus/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/community/wacca_plus/wacca_plus.py b/community/wacca_plus/wacca_plus.py index 666a243..c15bbf7 100644 --- a/community/wacca_plus/wacca_plus.py +++ b/community/wacca_plus/wacca_plus.py @@ -6,8 +6,6 @@ import os import time import openai import json -import sys -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))) from summarizer import generate_headline_and_content_from_images diff --git a/constants.py b/constants.py index 6125147..dc1c680 100644 --- a/constants.py +++ b/constants.py @@ -3,8 +3,6 @@ from enum import Enum DAYS_LIMIT=14 SOUND_VOLTEX_EXCEED_GEAR_NEWS_SITE ="https://p.eagate.573.jp/game/sdvx/vi/news/index.html" -IIDX_PINKY_CRUSH_NEWS_SITE="https://p.eagate.573.jp/game/2dx/32/info/index.html" # legacy should not be used, eamuse feed is more verbose -DDR_WORLD_NEWS_SITE="https://p.eagate.573.jp/game/ddr/ddrworld/info/index.html" POLARIS_CHORD_NEWS_SITE="https://p.eagate.573.jp/game/polarischord/pc/news/news.html" POLARIS_CHORD_RECENT_NEWS_LIMIT=15 @@ -54,11 +52,13 @@ class CHUNITHM_VERSION(Enum): LUMINOUS_PLUS = 1 VERSE = 2 X_VERSE = 3 + X_VERSE_X = 4 class MAIMAIDX_VERSION(Enum): PRISM = 1 PRISM_PLUS = 2 CIRCLE = 3 + CIRCLE_PLUS = 4 class ONGEKI_VERSION(Enum): REFRESH = 1 diff --git a/generate.py b/generate.py index ab7d968..819ce08 100644 --- a/generate.py +++ b/generate.py @@ -156,11 +156,8 @@ def generate_news_file(filename, url, version=None, formatted_name: str = None): # For e-amusement games you can choose to pull from a specific implementation of the scraper or the generic feed provided # by the e-amusement app. Information is different -def generate_iidx_news_file(eamuse_feed: bool=False): - if eamuse_feed: - news = generate_news_file("iidx_news", constants.EAMUSE_APP_API_ROUTE, constants.IIDX_EAMUSE_APP_ID) - else: # legacy should not be used, use eamuse app feed above - news = generate_news_file("iidx_news", constants.IIDX_PINKY_CRUSH_NEWS_SITE) +def generate_iidx_news_file(): + news = generate_news_file("iidx_news", constants.EAMUSE_APP_API_ROUTE, constants.IIDX_EAMUSE_APP_ID) attempt_broadcast_notifications(news, "New information for beatmania IIDX", "iidx") return news @@ -169,11 +166,8 @@ def generate_sdvx_news_file(): attempt_broadcast_notifications(news, "New Information for SOUND VOLTEX","sdvx") return news -def generate_ddr_news_file(eamuse_feed: bool=False): - if eamuse_feed: - news = generate_news_file("ddr_news", constants.EAMUSE_APP_FEED, constants.DDR_EAMUSE_APP_ID) - else: - news = generate_news_file("ddr_news", constants.DDR_WORLD_NEWS_SITE) +def generate_ddr_news_file(): + news = generate_news_file("ddr_news", constants.EAMUSE_APP_FEED, constants.DDR_EAMUSE_APP_ID) attempt_broadcast_notifications(news, "New information for DanceDanceRevolution", "ddr") return news @@ -213,12 +207,12 @@ def generate_gitadora_news_file(): return news def generate_chunithm_jp_news_file(): - news = generate_news_file("chunithm_jp_news", constants.CHUNITHM_JP_NEWS_SITE, constants.CHUNITHM_VERSION.X_VERSE) + news = generate_news_file("chunithm_jp_news", constants.CHUNITHM_JP_NEWS_SITE, constants.CHUNITHM_VERSION.X_VERSE_X) attempt_broadcast_notifications(news, "New information for CHUNITHM (Japan ver.)", "chunithm_jp") return news def generate_maimaidx_jp_news_file(): - news = generate_news_file("maimaidx_jp_news", constants.MAIMAIDX_JP_NEWS_SITE, constants.MAIMAIDX_VERSION.CIRCLE) + news = generate_news_file("maimaidx_jp_news", constants.MAIMAIDX_JP_NEWS_SITE, constants.MAIMAIDX_VERSION.CIRCLE_PLUS) attempt_broadcast_notifications(news, "New information for maimai DX (Japan ver.)", "maimaidx_jp") return news @@ -286,8 +280,8 @@ if __name__ == "__main__": os.makedirs(OUTPUT_DIR) sdvx_news_data = generate_sdvx_news_file() polaris_news_data = generate_polaris_chord_news_file() - iidx_news_data = generate_iidx_news_file(eamuse_feed=True) - ddr_news_data = generate_ddr_news_file(eamuse_feed=True) + iidx_news_data = generate_iidx_news_file() + ddr_news_data = generate_ddr_news_file() dance_rush_news_data = generate_dance_rush_news_file() dance_around_news_data = generate_dance_around_news_file() gitadora_news_data = generate_gitadora_news_file() @@ -334,14 +328,14 @@ if __name__ == "__main__": dance_around_news_data, wmmt_news ) - log_output("Creating merged news.json file for all news that are within " + str(constants.DAYS_LIMIT) + " days old") - log_output("Computing and Attaching Archived IDs for merged feed") - for item in news: - if 'archive_hash' not in item: - hash_value = compute_json_hash(json.dumps(item, sort_keys=True)) - item['archive_hash'] = hash_value - if ARCHIVE_NEWS: - save_news_to_db(news) - with open(OUTPUT_DIR+'/news.json', 'w') as json_file: - json.dump(attach_news_meta_data(news), json_file) - log_output("JOB DONE", "TASK") + # log_output("Creating merged news.json file for all news that are within " + str(constants.DAYS_LIMIT) + " days old") + # log_output("Computing and Attaching Archived IDs for merged feed") + # for item in news: + # if 'archive_hash' not in item: + # hash_value = compute_json_hash(json.dumps(item, sort_keys=True)) + # item['archive_hash'] = hash_value + # if ARCHIVE_NEWS: + # save_news_to_db(news) + # with open(OUTPUT_DIR+'/news.json', 'w') as json_file: + # json.dump(attach_news_meta_data(news), json_file) + # log_output("JOB DONE", "TASK") diff --git a/konami/__init__.py b/konami/__init__.py new file mode 100644 index 0000000..f7f4d66 --- /dev/null +++ b/konami/__init__.py @@ -0,0 +1,6 @@ +from konami.eamuse_app import parse_news_api_route, parse_news_page + +__all__ = [ + "parse_news_api_route", + "parse_news_page", +] \ No newline at end of file diff --git a/news_feed.py b/news_feed.py index c962e82..9bae903 100644 --- a/news_feed.py +++ b/news_feed.py @@ -20,31 +20,15 @@ Generic format for a news entry. All keys are considered to be nullable """ from site_scraper import SiteScraper, download_site_as_html -import konami.eamuse_app as eamuse_app -import bemani.sdvx as sound_voltex -import bemani.iidx as iidx -import bemani.ddr as ddr -import sega.chuni_jp as chunithm_jp -import bemani.polaris_chord as polaris_chord -import sega.chuni_intl as chuni_intl -import sega.maimaidx_jp as maimaidx_jp -import sega.maimaidx_intl as maimaidx_intl -import sega.ongeki_jp as ongeki_jp -import sega.idac as idac -import taito.music_diver as music_diver -import taito.street_fighter as street_fighter -import bandai_namco.taiko as taiko -import bandai_namco.wmmt as wmmt -import community.disc as disc -import community.wacca_plus.wacca_plus as wac_plus -import community.museca_plus as mus_plus -import community.rbdx as rbdx +from scrapers.base import NewsSource +import scrapers.registry as registry import constants import translate import summarizer from datetime import datetime + def _attach_llm_summaries(news_posts: list, game_name: str): for post in news_posts: image_urls = [img["image"] for img in post.get("images", []) if "image" in img] @@ -60,176 +44,292 @@ def _attach_llm_summaries(news_posts: list, game_name: str): post["is_ai_summary"] = True -def get_news(news_url: str, version=None) -> list: - if news_url == constants.SOUND_VOLTEX_EXCEED_GEAR_NEWS_SITE: - site_data = download_site_as_html(news_url) - news_posts = sorted(sound_voltex.parse_exceed_gear_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) - news_posts = translate.add_translate_text_to_en(news_posts, overrides=[("ボルテ", "SDVX")]) +# --------------------------------------------------------------------------- +# BEMANI (Specific feeds because these provide better information) +# --------------------------------------------------------------------------- + +@registry.register(constants.SOUND_VOLTEX_EXCEED_GEAR_NEWS_SITE) +class SoundVoltexSource(NewsSource): + def fetch(self, version=None) -> list[dict]: + from bemani.sdvx import parse_exceed_gear_news_site + site_data = download_site_as_html(constants.SOUND_VOLTEX_EXCEED_GEAR_NEWS_SITE) + news_posts = sorted(parse_exceed_gear_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) + return translate.add_translate_text_to_en(news_posts, overrides=[("ボルテ", "SDVX")]) + +# Can't find a Polaris feed on EAM app so this is here instead +@registry.register(constants.POLARIS_CHORD_NEWS_SITE) +class PolarisChordSource(NewsSource): + def fetch(self, version=None) -> list[dict]: + from bemani.polaris_chord import parse_polaris_chord_news_site + from bemani.iidx import KEY_TERMS_TL + site_data = download_site_as_html(constants.POLARIS_CHORD_NEWS_SITE) + news_posts = sorted( + parse_polaris_chord_news_site(site_data, constants.POLARIS_CHORD_RECENT_NEWS_LIMIT), + key=lambda x: x['timestamp'], + reverse=True, + ) + return translate.add_translate_text_to_en(news_posts, KEY_TERMS_TL) - elif news_url == constants.IIDX_PINKY_CRUSH_NEWS_SITE: - site_data = download_site_as_html(news_url) - news_posts = sorted(iidx.parse_pinky_crush_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) - news_posts = translate.add_translate_text_to_en(news_posts, iidx.KEY_TERMS_TL) - elif news_url == constants.POLARIS_CHORD_NEWS_SITE: - site_data = download_site_as_html(news_url) - news_posts = sorted(polaris_chord.parse_polaris_chord_news_site(site_data, constants.POLARIS_CHORD_RECENT_NEWS_LIMIT), key=lambda x: x['timestamp'], reverse=True) - news_posts = translate.add_translate_text_to_en(news_posts, iidx.KEY_TERMS_TL) +# --------------------------------------------------------------------------- +# E-AMUSEMENT APP FEEDS (General Konami/BEMANI) +# --------------------------------------------------------------------------- - elif news_url == constants.EAMUSE_APP_API_ROUTE: - site_data = download_site_as_html(news_url+"/?uuid_to="+version+"&format=json") +@registry.register(constants.EAMUSE_APP_API_ROUTE) +class EamuseAppSource(NewsSource): + def fetch(self, version=None) -> list[dict]: + from konami.eamuse_app import parse_news_api_route + from bemani.iidx import KEY_TERMS_TL + site_data = download_site_as_html( + constants.EAMUSE_APP_API_ROUTE + "/?uuid_to=" + version + "&format=json" + ) match version: case constants.IIDX_EAMUSE_APP_ID: - news_posts= sorted(eamuse_app.parse_news_api_route(site_data, "IIDX_EAMUSEMENT", constants.EAMUSE_POST_SITE), key=lambda x: x['timestamp'], reverse=True) - news_posts = translate.add_translate_text_to_en(news_posts, iidx.KEY_TERMS_TL) + news_posts = sorted(parse_news_api_route(site_data, "IIDX_EAMUSEMENT", constants.EAMUSE_POST_SITE), key=lambda x: x['timestamp'], reverse=True) + return translate.add_translate_text_to_en(news_posts, KEY_TERMS_TL) case constants.DDR_EAMUSE_APP_ID: - news_posts= sorted(eamuse_app.parse_news_api_route(site_data, "DDR_EAMUSEMENT", constants.EAMUSE_POST_SITE), key=lambda x: x['timestamp'], reverse=True) - news_posts = translate.add_translate_text_to_en(news_posts) + news_posts = sorted(parse_news_api_route(site_data, "DDR_EAMUSEMENT", constants.EAMUSE_POST_SITE), key=lambda x: x['timestamp'], reverse=True) + return translate.add_translate_text_to_en(news_posts) case constants.SDVX_EAMUSE_APP_ID: - news_posts= sorted(eamuse_app.parse_news_api_route(site_data, "SOUND_VOLTEX_EAMUSEMENT", constants.EAMUSE_POST_SITE ), key=lambda x: x['timestamp'], reverse=True) - news_posts = translate.add_translate_text_to_en(news_posts) + news_posts = sorted(parse_news_api_route(site_data, "SOUND_VOLTEX_EAMUSEMENT", constants.EAMUSE_POST_SITE), key=lambda x: x['timestamp'], reverse=True) + return translate.add_translate_text_to_en(news_posts) case constants.JUBEAT_EAMUSE_APP_ID: - news_posts= sorted(eamuse_app.parse_news_api_route(site_data, "JUBEAT_EAMUSEMENT", constants.EAMUSE_POST_SITE), key=lambda x: x['timestamp'], reverse=True) - news_posts = translate.add_translate_text_to_en(news_posts) + news_posts = sorted(parse_news_api_route(site_data, "JUBEAT_EAMUSEMENT", constants.EAMUSE_POST_SITE), key=lambda x: x['timestamp'], reverse=True) + return translate.add_translate_text_to_en(news_posts) case constants.POPN_MUSIC_EAMUSE_APP_ID: - news_posts= sorted(eamuse_app.parse_news_api_route(site_data, "POPN_MUSIC_EAMUSEMENT", constants.EAMUSE_POST_SITE), key=lambda x: x['timestamp'], reverse=True) - news_posts = translate.add_translate_text_to_en(news_posts) + news_posts = sorted(parse_news_api_route(site_data, "POPN_MUSIC_EAMUSEMENT", constants.EAMUSE_POST_SITE), key=lambda x: x['timestamp'], reverse=True) + return translate.add_translate_text_to_en(news_posts) case constants.GITADORA_EAMUSE_APP_ID: - news_posts= sorted(eamuse_app.parse_news_api_route(site_data, "GITADORA_EAMUSEMENT", constants.EAMUSE_POST_SITE), key=lambda x: x['timestamp'], reverse=True) - news_posts = translate.add_translate_text_to_en(news_posts) + news_posts = sorted(parse_news_api_route(site_data, "GITADORA_EAMUSEMENT", constants.EAMUSE_POST_SITE), key=lambda x: x['timestamp'], reverse=True) + return translate.add_translate_text_to_en(news_posts) case constants.NOSTALGIA_EAMUSE_APP_ID: - news_posts= sorted(eamuse_app.parse_news_api_route(site_data, "NOSTALGIA_EAMUSEMENT", constants.EAMUSE_POST_SITE), key=lambda x: x['timestamp'], reverse=True) - news_posts = translate.add_translate_text_to_en(news_posts) + news_posts = sorted(parse_news_api_route(site_data, "NOSTALGIA_EAMUSEMENT", constants.EAMUSE_POST_SITE), key=lambda x: x['timestamp'], reverse=True) + return translate.add_translate_text_to_en(news_posts) case constants.DANCE_RUSH_APP_ID: - news_posts= sorted(eamuse_app.parse_news_api_route(site_data, "DANCE_RUSH_EAMUSEMENT", constants.EAMUSE_POST_SITE), key=lambda x: x['timestamp'], reverse=True) - news_posts = translate.add_translate_text_to_en(news_posts) + news_posts = sorted(parse_news_api_route(site_data, "DANCE_RUSH_EAMUSEMENT", constants.EAMUSE_POST_SITE), key=lambda x: x['timestamp'], reverse=True) + return translate.add_translate_text_to_en(news_posts) case constants.DANCE_AROUND_APP_ID: - news_posts= sorted(eamuse_app.parse_news_api_route(site_data, "DANCE_AROUND_EAMUSEMENT", constants.EAMUSE_POST_SITE), key=lambda x: x['timestamp'], reverse=True) - news_posts = translate.add_translate_text_to_en(news_posts) + news_posts = sorted(parse_news_api_route(site_data, "DANCE_AROUND_EAMUSEMENT", constants.EAMUSE_POST_SITE), key=lambda x: x['timestamp'], reverse=True) + return translate.add_translate_text_to_en(news_posts) case _: raise ValueError("Cannot find provided e-amuse app gameId", version) - elif news_url == constants.DDR_WORLD_NEWS_SITE: - scraper = SiteScraper(headless=True) - site_data = scraper.get_page_source(news_url) - scraper.close() - news_posts = sorted(ddr.parse_ddr_world_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) +# --------------------------------------------------------------------------- +# SEGA +# --------------------------------------------------------------------------- + +@registry.register(constants.CHUNITHM_JP_NEWS_SITE) +class ChunithmJPSource(NewsSource): + def fetch(self, version=None) -> list[dict]: + from sega.chuni_jp import parse_chuni_jp_news_site, parse_chuni_jp_post_images + site_data = download_site_as_html(constants.CHUNITHM_JP_NEWS_SITE) + if version not in [constants.CHUNITHM_VERSION.VERSE, constants.CHUNITHM_VERSION.X_VERSE]: + return [] + news_posts = sorted(parse_chuni_jp_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) news_posts = translate.add_translate_text_to_en(news_posts) + if constants.CHUNI_RECURSIVE_IMAGE: + for i in range(len(news_posts)): + if not news_posts[i]["url"]: + continue + post_site_data = download_site_as_html(news_posts[i]["url"]) + post_images = parse_chuni_jp_post_images(post_site_data) + news_posts[i]["images"].extend([ + image for image in post_images + if not any(existing["image"] == image["image"] for existing in news_posts[i]["images"]) + ]) + return news_posts + - elif news_url == constants.CHUNITHM_JP_NEWS_SITE: - site_data = download_site_as_html(news_url) - if version in [ constants.CHUNITHM_VERSION.VERSE, constants.CHUNITHM_VERSION.X_VERSE ]: - news_posts = sorted(chunithm_jp.parse_chuni_jp_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) - news_posts = translate.add_translate_text_to_en(news_posts) - if constants.CHUNI_RECURSIVE_IMAGE: - for i in range(len(news_posts)): - if not news_posts[i]["url"]: - continue - post_site_data = download_site_as_html(news_posts[i]["url"]) - post_images = chunithm_jp.parse_chuni_jp_post_images(post_site_data) - news_posts[i]["images"].extend([image for image in post_images if not any(existing_image['image'] == image['image'] for existing_image in news_posts[i]["images"])]) - - elif news_url == constants.CHUNITHM_INTL_NEWS_SITE: - site_data = download_site_as_html(news_url) - news_posts = sorted(chuni_intl.parse_chuni_intl_api_route(site_data, "CHUNITHM_INTL", constants.CHUNITHM_INTL_RECENT_NEWS_LIMIT), key=lambda x: x['timestamp'], reverse=True) +@registry.register(constants.CHUNITHM_INTL_NEWS_SITE) +class ChunithmIntlSource(NewsSource): + def fetch(self, version=None) -> list[dict]: + from sega.chuni_intl import parse_chuni_intl_api_route, parse_chuni_intl_post_images + site_data = download_site_as_html(constants.CHUNITHM_INTL_NEWS_SITE) + news_posts = sorted( + parse_chuni_intl_api_route(site_data, "CHUNITHM_INTL", constants.CHUNITHM_INTL_RECENT_NEWS_LIMIT), + key=lambda x: x['timestamp'], + reverse=True, + ) if constants.CHUNI_RECURSIVE_IMAGE: for i in range(len(news_posts)): if not news_posts[i]["url"]: continue post_site_data = download_site_as_html(news_posts[i]["url"]) - post_images = chuni_intl.parse_chuni_intl_post_images(post_site_data) - news_posts[i]["images"].extend([image for image in post_images if not any(existing_image['image'] == image['image'] for existing_image in news_posts[i]["images"])]) + post_images = parse_chuni_intl_post_images(post_site_data) + news_posts[i]["images"].extend([ + image for image in post_images + if not any(existing["image"] == image["image"] for existing in news_posts[i]["images"]) + ]) + return news_posts - elif news_url == constants.MAIMAIDX_JP_NEWS_SITE: - site_data = download_site_as_html(news_url) - if version in [ constants.MAIMAIDX_VERSION.PRISM_PLUS, constants.MAIMAIDX_VERSION.CIRCLE ]: - news_posts = sorted(maimaidx_jp.parse_maimaidx_jp_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) - news_posts = translate.add_translate_text_to_en(news_posts) +@registry.register(constants.MAIMAIDX_JP_NEWS_SITE) +class MaimaiDXJPSource(NewsSource): + def fetch(self, version=None) -> list[dict]: + from sega.maimaidx_jp import parse_maimaidx_jp_news_site + site_data = download_site_as_html(constants.MAIMAIDX_JP_NEWS_SITE) + if version not in [ + constants.MAIMAIDX_VERSION.PRISM_PLUS, + constants.MAIMAIDX_VERSION.CIRCLE, + constants.MAIMAIDX_VERSION.CIRCLE_PLUS, + ]: + return [] + news_posts = sorted(parse_maimaidx_jp_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) + return translate.add_translate_text_to_en(news_posts) - elif news_url == constants.MAIMAIDX_INTL_NEWS_SITE: - site_data = download_site_as_html(news_url) - news_posts = sorted(maimaidx_intl.parse_maimaidx_intl_api_route(site_data, "MAIMAIDX_INTL", constants.MAIMAIDX_INTL_RECENT_NEWS_LIMIT), key=lambda x: x['timestamp'], reverse=True) + +@registry.register(constants.MAIMAIDX_INTL_NEWS_SITE) +class MaimaiDXIntlSource(NewsSource): + def fetch(self, version=None) -> list[dict]: + from sega.maimaidx_intl import parse_maimaidx_intl_api_route + site_data = download_site_as_html(constants.MAIMAIDX_INTL_NEWS_SITE) + news_posts = sorted( + parse_maimaidx_intl_api_route(site_data, "MAIMAIDX_INTL", constants.MAIMAIDX_INTL_RECENT_NEWS_LIMIT), + key=lambda x: x['timestamp'], + reverse=True, + ) _attach_llm_summaries(news_posts, "maimai DX International") + return news_posts - elif news_url == constants.ONGEKI_JP_NEWS_SITE: - site_data = download_site_as_html(news_url) - if version == constants.ONGEKI_VERSION.REFRESH: - news_posts = sorted(ongeki_jp.parse_ongeki_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) - news_posts = translate.add_translate_text_to_en(news_posts) - elif news_url == constants.IDAC_NEWS_SITE: - site_data = download_site_as_html(news_url) - news_posts = sorted(idac.parse_idac_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) +@registry.register(constants.ONGEKI_JP_NEWS_SITE) +class OngekiJPSource(NewsSource): + def fetch(self, version=None) -> list[dict]: + from sega.ongeki_jp import parse_ongeki_news_site + site_data = download_site_as_html(constants.ONGEKI_JP_NEWS_SITE) + if version != constants.ONGEKI_VERSION.REFRESH: + return [] + news_posts = sorted(parse_ongeki_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) + return translate.add_translate_text_to_en(news_posts) + + +@registry.register(constants.IDAC_NEWS_SITE) +class IDACSource(NewsSource): + def fetch(self, version=None) -> list[dict]: + from sega.idac import parse_idac_news_site, get_promo_image + site_data = download_site_as_html(constants.IDAC_NEWS_SITE) + news_posts = sorted(parse_idac_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) for news in news_posts: - promo_image_url = idac.get_promo_image(download_site_as_html(news["url"])) + promo_image_url = get_promo_image(download_site_as_html(news["url"])) if promo_image_url.endswith("png") or promo_image_url.endswith("jpg"): news["images"] = [{'image': promo_image_url, 'link': None}] else: news["images"] = [] - news_posts = translate.add_translate_text_to_en(news_posts) + return translate.add_translate_text_to_en(news_posts) - elif news_url == constants.MUSIC_DIVER_NEWS: - api_data = download_site_as_html(news_url) - news_posts = sorted(music_diver.parse_music_diver_news_json(api_data), key=lambda x: x['timestamp'], reverse=True) - elif news_url == constants.STREET_FIGHTER_NEWS_SITE: - site_data = download_site_as_html(news_url) - news_posts = sorted(street_fighter.parse_sf_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) - news_posts = translate.add_translate_text_to_en(news_posts) +# --------------------------------------------------------------------------- +# Taito +# --------------------------------------------------------------------------- +@registry.register(constants.MUSIC_DIVER_NEWS) +class MusicDiverSource(NewsSource): + def fetch(self, version=None) -> list[dict]: + from taito.music_diver import parse_music_diver_news_json + api_data = download_site_as_html(constants.MUSIC_DIVER_NEWS) + return sorted(parse_music_diver_news_json(api_data), key=lambda x: x['timestamp'], reverse=True) - elif news_url == constants.TAIKO_BLOG_SITE: - site_data = download_site_as_html(news_url) - news_posts = sorted(taiko.parse_taiko_blog_site(site_data), key=lambda x: x['timestamp'], reverse=True) - news_posts = translate.add_translate_text_to_en(news_posts) - elif news_url == constants.WANGAN_MAXI_GENERIC: +@registry.register(constants.STREET_FIGHTER_NEWS_SITE) +class StreetFighterSource(NewsSource): + def fetch(self, version=None) -> list[dict]: + from taito.street_fighter import parse_sf_news_site + site_data = download_site_as_html(constants.STREET_FIGHTER_NEWS_SITE) + news_posts = sorted(parse_sf_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) + return translate.add_translate_text_to_en(news_posts) + + +# --------------------------------------------------------------------------- +# BANDAI NAMCO +# --------------------------------------------------------------------------- + +@registry.register(constants.TAIKO_BLOG_SITE) +class TaikoBlogSource(NewsSource): + def fetch(self, version=None) -> list[dict]: + from bandai_namco.taiko import parse_taiko_blog_site + site_data = download_site_as_html(constants.TAIKO_BLOG_SITE) + news_posts = sorted(parse_taiko_blog_site(site_data), key=lambda x: x['timestamp'], reverse=True) + return translate.add_translate_text_to_en(news_posts) + + +@registry.register(constants.WANGAN_MAXI_GENERIC) +class WanganMaxiSource(NewsSource): + def fetch(self, version=None) -> list[dict]: + from bandai_namco.wmmt import ( + get_wmmt_na_news_post_links, + get_wmmt_asia_oce_news_post_links, + get_wmmt_jp_news_post_links, + parse_wmmt_na_news, + parse_wmmt_asia_oce_news, + parse_wmmt_jp_news, + ) news_posts = [] + na_site_data = download_site_as_html(constants.WANGAN_MAXI_NA_NEWS_SITE, response_encoding="utf-8") - prelim_na_news_data = wmmt.get_wmmt_na_news_post_links(na_site_data) + prelim_na_news_data = get_wmmt_na_news_post_links(na_site_data) for data in prelim_na_news_data: post_site_data = download_site_as_html(data["url"]) - news = wmmt.parse_wmmt_na_news(post_site_data, data) + news = parse_wmmt_na_news(post_site_data, data) if news is not None: news_posts.append(news) + asia_oce_site_data = download_site_as_html(constants.WANGAN_MAXI_ASIA_OCE_NEWS_SITE, response_encoding="utf-8") - prelim_asia_oce_news_data = wmmt.get_wmmt_asia_oce_news_post_links(asia_oce_site_data) + prelim_asia_oce_news_data = get_wmmt_asia_oce_news_post_links(asia_oce_site_data) for data in prelim_asia_oce_news_data: post_site_data = download_site_as_html(data["url"]) - news = wmmt.parse_wmmt_asia_oce_news(post_site_data, data) + news = parse_wmmt_asia_oce_news(post_site_data, data) if news is not None: news_posts.append(news) + jp_site_data = download_site_as_html(constants.WANGAN_MAXI_JP_NEWS_SITE, response_encoding="utf-8") - prelim_jp_news_data = wmmt.get_wmmt_jp_news_post_links(jp_site_data) + prelim_jp_news_data = get_wmmt_jp_news_post_links(jp_site_data) jp_news = [] for data in prelim_jp_news_data: post_site_data = download_site_as_html(data["url"], response_encoding="utf-8") - news = wmmt.parse_wmmt_jp_news(post_site_data, data) + news = parse_wmmt_jp_news(post_site_data, data) if news is not None: jp_news.append(news) jp_news = translate.add_translate_text_to_en(jp_news) news_posts.extend(jp_news) - news_posts = sorted(news_posts, key=lambda x: x['timestamp'], reverse=True) - return news_posts + return sorted(news_posts, key=lambda x: x['timestamp'], reverse=True) + + +# --------------------------------------------------------------------------- +# Community +# --------------------------------------------------------------------------- + +@registry.register(constants.WACCA_PLUS_MAGIC_STRING) +class WaccaPlusSource(NewsSource): + def fetch(self, version=None) -> list[dict]: + from community.wacca_plus.wacca_plus import parse_announcement_messages, check_is_generation_possible + from community.disc import fetch_messages + if not check_is_generation_possible(): + return [] + messages = fetch_messages(constants.WACCA_PLUS_MAGIC_STRING) + return sorted(parse_announcement_messages(messages), key=lambda x: x['timestamp'], reverse=True) - elif news_url == constants.WACCA_PLUS_MAGIC_STRING: - if not wac_plus.check_is_generation_possible(): - news_posts = [] - else: - messages = disc.fetch_messages(constants.WACCA_PLUS_MAGIC_STRING) - news_posts = sorted(wac_plus.parse_announcement_messages(messages), key=lambda x: x['timestamp'], reverse=True) - elif news_url == constants.MUSECA_PLUS_NEWS_SITE: - site_data = download_site_as_html(news_url) - news_posts = sorted(mus_plus.parse_museca_plus_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) +@registry.register(constants.MUSECA_PLUS_NEWS_SITE) +class MusecaPlusSource(NewsSource): + def fetch(self, version=None) -> list[dict]: + from community.museca_plus import parse_museca_plus_news_site + site_data = download_site_as_html(constants.MUSECA_PLUS_NEWS_SITE) + return sorted(parse_museca_plus_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) - elif news_url == constants.RB_DELUXE_PLUS_NEWS: - site_data = download_site_as_html(news_url) - news_posts = rbdx.get_carousel_posts(site_data) + +@registry.register(constants.RB_DELUXE_PLUS_NEWS) +class RBDeluxePlusSource(NewsSource): + def fetch(self, version=None) -> list[dict]: + from community.rbdx import get_carousel_posts + site_data = download_site_as_html(constants.RB_DELUXE_PLUS_NEWS) + news_posts = get_carousel_posts(site_data) _attach_llm_summaries(news_posts, "REFLEC BEAT PLUS DELUXE") + return news_posts - else: - news_posts = [] - return news_posts + +def get_news(news_url: str, version=None) -> list[dict]: + source_cls = registry.get_source(news_url) + if source_cls is None: + return [] + return source_cls().fetch(version) diff --git a/scrapers/__init__.py b/scrapers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scrapers/base.py b/scrapers/base.py new file mode 100644 index 0000000..8d9839f --- /dev/null +++ b/scrapers/base.py @@ -0,0 +1,8 @@ +from abc import ABC, abstractmethod + + +class NewsSource(ABC): + + @abstractmethod + def fetch(self, version=None) -> list[dict]: + pass diff --git a/scrapers/registry.py b/scrapers/registry.py new file mode 100644 index 0000000..6d9bde6 --- /dev/null +++ b/scrapers/registry.py @@ -0,0 +1,22 @@ +from __future__ import annotations +from typing import Optional, TYPE_CHECKING + +if TYPE_CHECKING: + from scrapers.base import NewsSource + +_registry: dict[str, type["NewsSource"]] = {} + + +def register(url_key: str): + def decorator(cls): + _registry[url_key] = cls + return cls + return decorator + + +def get_source(url_key: str) -> Optional[type["NewsSource"]]: + return _registry.get(url_key) + + +def get_all() -> dict[str, type["NewsSource"]]: + return dict(_registry) diff --git a/sega/__init__.py b/sega/__init__.py new file mode 100644 index 0000000..242ab52 --- /dev/null +++ b/sega/__init__.py @@ -0,0 +1,23 @@ +from sega.chuni_jp import parse_chuni_jp_news_site, parse_chuni_jp_post_images +from sega.chuni_intl import ( + parse_chuni_intl_api_route, + parse_chuni_intl_news_site, + parse_chuni_intl_post_images, +) +from sega.maimaidx_jp import parse_maimaidx_jp_news_site +from sega.maimaidx_intl import parse_maimaidx_intl_api_route +from sega.ongeki_jp import parse_ongeki_news_site +from sega.idac import parse_idac_news_site, get_promo_image + +__all__ = [ + "parse_chuni_jp_news_site", + "parse_chuni_jp_post_images", + "parse_chuni_intl_api_route", + "parse_chuni_intl_news_site", + "parse_chuni_intl_post_images", + "parse_maimaidx_jp_news_site", + "parse_maimaidx_intl_api_route", + "parse_ongeki_news_site", + "parse_idac_news_site", + "get_promo_image", +] \ No newline at end of file diff --git a/sega/chuni_intl.py b/sega/chuni_intl.py index 64d279c..816b857 100644 --- a/sega/chuni_intl.py +++ b/sega/chuni_intl.py @@ -1,100 +1,11 @@ import re from datetime import datetime, timedelta, timezone -from enum import Enum import json from urllib.parse import urljoin from bs4 import BeautifulSoup -class ParserVersion(Enum): - ALPHA = 1 - - -def make_chuni_intl_parser(identifier: str, parser: ParserVersion): - def alpha_parser(html: str): - """ - Confirmed on: - LUMINOUS PLUS - """ - soup = BeautifulSoup(html, "html.parser") - base_url = "https://info-chunithm.sega.com/" - items = soup.select("li.news--list__item") - results = [] - - for item in items: - a_tag = item.select_one("a.news--list__post") - if not a_tag: - continue - - url = urljoin(base_url, a_tag["href"]) - date_text = item.select_one("div.news--date").text.strip() - headline = item.select_one("p.news--title").text.strip() - img_tag = item.select_one("div.news--thumbnail img") - image_url = urljoin(base_url, img_tag["src"]) if img_tag else None - - date_match = re.match(r"(\d{4})\.(\d{1,2})\.(\d{1,2})", date_text) - if not date_match: - continue - year, month, day = map(int, date_match.groups()) - jst = timezone(timedelta(hours=9)) - dt = datetime(year, month, day, tzinfo=jst) - timestamp = int(dt.timestamp()) - - results.append( - { - "date": dt.strftime("%Y-%m-%d"), - "identifier": identifier, - "type": None, - "timestamp": timestamp, - "headline": None, - "content": headline, - "url": url, - "images": [{"image": image_url, "link": url}] if image_url else [], - 'is_ai_summary': False - } - ) - - return results - - if parser == ParserVersion.ALPHA: - return alpha_parser - - -def make_image_extractor(version: ParserVersion): - """ - Gets all the images from a full post page as CHUNITHM intl has more relevant images - hidden in the actual posts - """ - - def image_extractor_alpha(html: str): - base_url = "https://info-chunithm.sega.com/" - soup = BeautifulSoup(html, "html.parser") - images = [] - news_post = soup.select_one(".news--post") - if not news_post: - return images - - for img in news_post.find_all("img"): - src = img.get("src") or img.get("data-src") - if not src: - continue - - full_url = urljoin(base_url, src) - parent = img.find_parent("a") - link = parent.get("href") if parent and parent.name == "a" else None - - images.append( - {"image": full_url, "link": urljoin(base_url, link) if link else None} - ) - - return images - - if version == ParserVersion.ALPHA: - return image_extractor_alpha - else: - raise ValueError("Unknown Parser Version") - def parse_chuni_intl_api_route(raw_api_data: str, identifier: str, limit: int): route_data = json.loads(raw_api_data) route_data = route_data[:limit] @@ -126,7 +37,76 @@ def parse_chuni_intl_api_route(raw_api_data: str, identifier: str, limit: int): return entries -parse_chuni_intl_news_site = make_chuni_intl_parser( - "CHUNITHM_INTL", ParserVersion.ALPHA -) -parse_chuni_intl_post_images = make_image_extractor(ParserVersion.ALPHA) +def parse_chuni_intl_post_images(html: str): + """ + Gets all the images from a full post page as CHUNITHM intl has more relevant images + hidden in the actual posts. + """ + base_url = "https://info-chunithm.sega.com/" + soup = BeautifulSoup(html, "html.parser") + images = [] + news_post = soup.select_one(".news--post") + if not news_post: + return images + + for img in news_post.find_all("img"): + src = img.get("src") or img.get("data-src") + if not src: + continue + + full_url = urljoin(base_url, src) + parent = img.find_parent("a") + link = parent.get("href") if parent and parent.name == "a" else None + + images.append( + {"image": full_url, "link": urljoin(base_url, link) if link else None} + ) + + return images + + +def parse_chuni_intl_news_site(html: str): + """ + Confirmed on: + LUMINOUS PLUS + """ + identifier = "CHUNITHM_INTL" + soup = BeautifulSoup(html, "html.parser") + base_url = "https://info-chunithm.sega.com/" + items = soup.select("li.news--list__item") + results = [] + + for item in items: + a_tag = item.select_one("a.news--list__post") + if not a_tag: + continue + + url = urljoin(base_url, a_tag["href"]) + date_text = item.select_one("div.news--date").text.strip() + headline = item.select_one("p.news--title").text.strip() + img_tag = item.select_one("div.news--thumbnail img") + image_url = urljoin(base_url, img_tag["src"]) if img_tag else None + + date_match = re.match(r"(\d{4})\.(\d{1,2})\.(\d{1,2})", date_text) + if not date_match: + continue + year, month, day = map(int, date_match.groups()) + jst = timezone(timedelta(hours=9)) + dt = datetime(year, month, day, tzinfo=jst) + timestamp = int(dt.timestamp()) + + results.append( + { + "date": dt.strftime("%Y-%m-%d"), + "identifier": identifier, + "type": None, + "timestamp": timestamp, + "headline": None, + "content": headline, + "url": url, + "images": [{"image": image_url, "link": url}] if image_url else [], + "is_ai_summary": False, + } + ) + + return results \ No newline at end of file diff --git a/sega/chuni_jp.py b/sega/chuni_jp.py index 452e153..a914270 100644 --- a/sega/chuni_jp.py +++ b/sega/chuni_jp.py @@ -1,114 +1,93 @@ import re from datetime import datetime, timedelta, timezone -from enum import Enum from urllib.parse import urljoin from bs4 import BeautifulSoup -class ParserVersion(Enum): - ALPHA = 1 - - -def make_chuni_jp_parser(identifier: str, parser: ParserVersion): - def alpha_parser(html: str): - """ - Confirmed on: - VERSE - """ - soup = BeautifulSoup(html, "html.parser") - news_entries = [] - news_wrapper = soup.find("div", class_="newsMainWrapper-left") - if not news_wrapper: - return news_entries - for a_tag in news_wrapper.find_all("a", href=True): - if not a_tag.find("div", class_="chuniCommonBox-inner"): - continue - news_dict = {} - news_url = a_tag.get("href") - news_dict["url"] = news_url - - date_container = a_tag.find("div", class_="chuniCommonBox-inner-title") - date_str = None - if date_container: - title_span = date_container.find("span", class_="title") - if title_span: - text = title_span.get_text(strip=True) - date_match = re.search(r"(\d{4}\.\d{2}\.\d{2})", text) - if date_match: - date_str = date_match.group(1) - news_dict["date"] = date_str - news_dict["type"] = None - timestamp = None - if date_str: - try: - dt = datetime.strptime(date_str, "%Y.%m.%d") - dt = dt.replace(tzinfo=timezone(timedelta(hours=9))) - timestamp = int(dt.timestamp()) - except Exception: - timestamp = None - news_dict["timestamp"] = timestamp - - main_content = a_tag.find("div", class_="chuniCommonBox-inner-main") - content_text = "" - if main_content: - content_text = main_content.get_text(separator=" ", strip=True) - news_dict["content"] = content_text - - images = {"image": None, "link": None} - if main_content: - img_tag = main_content.find("img") - if img_tag: - images["image"] = img_tag.get("src") - images["link"] = news_url - news_dict["images"] = [images] - news_dict["identifier"] = identifier - news_dict["is_ai_summary"] = False - - news_entries.append(news_dict) - +def parse_chuni_jp_news_site(html: str): + """ + Confirmed on: + VERSE + """ + identifier = "CHUNITHM_JP" + soup = BeautifulSoup(html, "html.parser") + news_entries = [] + news_wrapper = soup.find("div", class_="newsMainWrapper-left") + if not news_wrapper: return news_entries - - if parser == ParserVersion.ALPHA: - return alpha_parser - - -def make_image_extractor(version: ParserVersion): + for a_tag in news_wrapper.find_all("a", href=True): + if not a_tag.find("div", class_="chuniCommonBox-inner"): + continue + news_dict = {} + news_url = a_tag.get("href") + news_dict["url"] = news_url + + date_container = a_tag.find("div", class_="chuniCommonBox-inner-title") + date_str = None + if date_container: + title_span = date_container.find("span", class_="title") + if title_span: + text = title_span.get_text(strip=True) + date_match = re.search(r"(\d{4}\.\d{2}\.\d{2})", text) + if date_match: + date_str = date_match.group(1) + news_dict["date"] = date_str + news_dict["type"] = None + timestamp = None + if date_str: + try: + dt = datetime.strptime(date_str, "%Y.%m.%d") + dt = dt.replace(tzinfo=timezone(timedelta(hours=9))) + timestamp = int(dt.timestamp()) + except Exception: + timestamp = None + news_dict["timestamp"] = timestamp + + main_content = a_tag.find("div", class_="chuniCommonBox-inner-main") + content_text = "" + if main_content: + content_text = main_content.get_text(separator=" ", strip=True) + news_dict["content"] = content_text + + images = {"image": None, "link": None} + if main_content: + img_tag = main_content.find("img") + if img_tag: + images["image"] = img_tag.get("src") + images["link"] = news_url + news_dict["images"] = [images] + news_dict["identifier"] = identifier + news_dict["is_ai_summary"] = False + + news_entries.append(news_dict) + + return news_entries + + +def parse_chuni_jp_post_images(html: str): """ - Gets all the images from a full post page as CHUNITHM intl has more relevant images - hidden in the actual posts + Gets all the images from a full post page as CHUNITHM JP has more relevant images + hidden in the actual posts. """ + base_url = "https://info-chunithm.sega.jp/" + soup = BeautifulSoup(html, "html.parser") + images = [] - def image_extractor_alpha(html: str): - base_url = "https://info-chunithm.sega.jp/" - soup = BeautifulSoup(html, "html.parser") - images = [] - - container = soup.select_one(".chuniCommonBox-inner-main") - if not container: - return images - for img in container.find_all("img"): - if img.find_parent("p") and "©" in img.find_parent("p").text: - continue - - src = img.get("src") or img.get("data-src") - if not src: - continue - full_url = urljoin(base_url, src) - parent = img.find_parent("a") - link = parent.get("href") if parent and parent.name == "a" else None - images.append( - {"image": full_url, "link": urljoin(base_url, link) if link else None} - ) + container = soup.select_one(".chuniCommonBox-inner-main") + if not container: return images - - if version == ParserVersion.ALPHA: - return image_extractor_alpha - else: - raise ValueError("Unknown Parser Version") - - -parse_chuni_jp_news_site = make_chuni_jp_parser( - "CHUNITHM_JP", ParserVersion.ALPHA -) -parse_chuni_jp_post_images = make_image_extractor(ParserVersion.ALPHA) + for img in container.find_all("img"): + if img.find_parent("p") and "©" in img.find_parent("p").text: + continue + + src = img.get("src") or img.get("data-src") + if not src: + continue + full_url = urljoin(base_url, src) + parent = img.find_parent("a") + link = parent.get("href") if parent and parent.name == "a" else None + images.append( + {"image": full_url, "link": urljoin(base_url, link) if link else None} + ) + return images diff --git a/sega/maimaidx_intl.py b/sega/maimaidx_intl.py index 3e26a37..8182117 100644 --- a/sega/maimaidx_intl.py +++ b/sega/maimaidx_intl.py @@ -1,53 +1,7 @@ from bs4 import BeautifulSoup from datetime import datetime, timezone, timedelta -from enum import Enum import json -class ParserVersion(Enum): - ALPHA=1 - -def make_maimaidx_intl_parser(identifier: str, parser: ParserVersion): - """ - Parses the download page of maimai dx intl site. API route method below is preferred as information is the same - """ - def alpha_parser(html: str): - """ - Confirmed on: - PRISM - """ - soup = BeautifulSoup(html, "html.parser") - items = soup.select(".dl--pop__item") - - entries = [] - for item in items: - date_text = item.select_one(".dl--pop__head").text.strip().replace(" UP", "") - dt = datetime.strptime(date_text, "%Y.%m.%d").replace(tzinfo=timezone(timedelta(hours=9))) - timestamp = int(dt.timestamp()) - - img_tag = item.select_one("a.dl--pop__thumb img") - image_url = img_tag["srcset"] if img_tag else None - full_image_url = image_url.replace("../", "https://maimai.sega.com/") if image_url else None - - entry = { - "date": date_text, - "identifier": identifier, - "type": None, - "timestamp": timestamp, - "headline": None, - "content": f"New maimai DX International News / maimai DX International の新しいお知らせ\n\n{full_image_url}", - "url": None, - "images": [ - { - "image": full_image_url, - "link": None - } - ], - 'is_ai_summary': False - } - entries.append(entry) - return entries - if parser == ParserVersion.ALPHA: - return alpha_parser def parse_maimaidx_intl_api_route(raw_api_data: str, identifier: str, limit: int): route_data = json.loads(raw_api_data) @@ -84,6 +38,3 @@ def parse_maimaidx_intl_api_route(raw_api_data: str, identifier: str, limit: int } entries.append(entry) return entries - - -parse_maimaidx_intl_news_site = make_maimaidx_intl_parser("MAIMAIDX_INTL", ParserVersion.ALPHA) diff --git a/sega/maimaidx_jp.py b/sega/maimaidx_jp.py index 1314325..2b61c9a 100644 --- a/sega/maimaidx_jp.py +++ b/sega/maimaidx_jp.py @@ -1,60 +1,53 @@ from bs4 import BeautifulSoup from datetime import datetime, timezone, timedelta from urllib.parse import urljoin -from enum import Enum - -class ParserVersion(Enum): - ALPHA=1 - -def make_maimaidx_jpn_parser(identifier: str, parser: ParserVersion): - def alpha_parser(html: str): - """ - Confirmed on: - PRISM PLUS - """ - soup = BeautifulSoup(html, "html.parser") - base_url = "https://info-maimai.sega.jp/" - news_items = [] - - news_boxes = soup.select(".maiPager-content .newsBox") - for box in news_boxes: - a_tag = box.select_one("a") - url = urljoin(base_url, a_tag["href"]) if a_tag and a_tag.get("href") else None - - img_tag = box.select_one("img") - image_url = urljoin(base_url, img_tag["src"]) if img_tag else None - - date_tag = box.select_one(".newsDate") - raw_date = date_tag.get_text(strip=True) if date_tag else None - - jst = timezone(timedelta(hours=9)) - try: - dt = datetime.strptime(raw_date.split(" ")[0], "%Y.%m.%d").replace(tzinfo=jst) - timestamp = int(dt.timestamp()) - except: - dt = None - timestamp = 0 - - content_tag = box.select_one(".newsLink") - content = content_tag.get_text(strip=True) if content_tag else None - - news_items.append({ - "date": raw_date, - "identifier": identifier, - "type": None, - "timestamp": timestamp, - "headline": None, - "content": content, - "url": url, - 'is_ai_summary': False, - "images": [{ - "image": image_url, - "link": url - }] if image_url else [] - }) - - return news_items - if parser == ParserVersion.ALPHA: - return alpha_parser - -parse_maimaidx_jp_news_site = make_maimaidx_jpn_parser("MAIMAIDX_JP", ParserVersion.ALPHA) + + +def parse_maimaidx_jp_news_site(html: str): + """ + Confirmed on: + PRISM PLUS + """ + identifier = "MAIMAIDX_JP" + soup = BeautifulSoup(html, "html.parser") + base_url = "https://info-maimai.sega.jp/" + news_items = [] + + news_boxes = soup.select(".maiPager-content .newsBox") + for box in news_boxes: + a_tag = box.select_one("a") + url = urljoin(base_url, a_tag["href"]) if a_tag and a_tag.get("href") else None + + img_tag = box.select_one("img") + image_url = urljoin(base_url, img_tag["src"]) if img_tag else None + + date_tag = box.select_one(".newsDate") + raw_date = date_tag.get_text(strip=True) if date_tag else None + + jst = timezone(timedelta(hours=9)) + try: + dt = datetime.strptime(raw_date.split(" ")[0], "%Y.%m.%d").replace(tzinfo=jst) + timestamp = int(dt.timestamp()) + except Exception: + dt = None + timestamp = 0 + + content_tag = box.select_one(".newsLink") + content = content_tag.get_text(strip=True) if content_tag else None + + news_items.append({ + "date": raw_date, + "identifier": identifier, + "type": None, + "timestamp": timestamp, + "headline": None, + "content": content, + "url": url, + "is_ai_summary": False, + "images": [{ + "image": image_url, + "link": url + }] if image_url else [] + }) + + return news_items \ No newline at end of file diff --git a/sega/ongeki_jp.py b/sega/ongeki_jp.py index f9c2dc4..c173189 100644 --- a/sega/ongeki_jp.py +++ b/sega/ongeki_jp.py @@ -1,68 +1,58 @@ -import time -from datetime import datetime -from enum import Enum +from datetime import datetime, timezone, timedelta from bs4 import BeautifulSoup - -class ParserVersion(Enum): - ALPHA = 1 - - -def make_ongeki_parser(identifier: str, parser: ParserVersion): - def alpha_parser(html: str): - soup = BeautifulSoup(html, "html.parser") - items = [] - - for li in soup.select("li.p-news__listChild"): - a_tag = li.select_one("a.p-news__listLink") - url = a_tag["href"] if a_tag else None - - img_tag = li.select_one(".p-news__listThumb img") - image_url = img_tag["src"] if img_tag else None - image_alt = img_tag["alt"] if img_tag else "" - image_link = url if image_url else None - - date_type_text = li.select_one(".p-news__listTextUpper") - date_text = ( - date_type_text.text.strip().split("/")[0].strip() - if date_type_text - else None - ) - type_text = ( - date_type_text.text.strip().split("/")[-1].strip() - if "/" in date_type_text.text - else None - ) - - timestamp = None - if date_text: - try: - dt = datetime.strptime(date_text, "%Y.%m.%d %a") - timestamp = int(time.mktime(dt.timetuple())) - except: - timestamp = None - - entry = { - "date": date_text, - "identifier": identifier, - "type": type_text if type_text not in ["GAME", "CARDMAKER"] else None, - "timestamp": timestamp, - "headline": None, - "content": image_alt, - "url": url, - "is_ai_summary": False, - "images": [{"image": image_url, "link": image_link}] - if image_url - else [], - } - - items.append(entry) - - return items - - if parser == ParserVersion.ALPHA: - return alpha_parser - - -parse_ongeki_news_site = make_ongeki_parser("ONGEKI_JPN", ParserVersion.ALPHA) +JST = timezone(timedelta(hours=9)) + + +def parse_ongeki_news_site(html: str): + identifier = "ONGEKI_JPN" + soup = BeautifulSoup(html, "html.parser") + items = [] + + for li in soup.select("li.p-news__listChild"): + a_tag = li.select_one("a.p-news__listLink") + url = a_tag["href"] if a_tag else None + + img_tag = li.select_one(".p-news__listThumb img") + image_url = img_tag["src"] if img_tag else None + image_alt = img_tag["alt"] if img_tag else "" + image_link = url if image_url else None + + date_type_text = li.select_one(".p-news__listTextUpper") + date_text = ( + date_type_text.text.strip().split("/")[0].strip() + if date_type_text + else None + ) + type_text = ( + date_type_text.text.strip().split("/")[-1].strip() + if date_type_text and "/" in date_type_text.text + else None + ) + + timestamp = None + if date_text: + try: + dt = datetime.strptime(date_text, "%Y.%m.%d %a").replace(tzinfo=JST) + timestamp = int(dt.timestamp()) + except Exception: + timestamp = None + + entry = { + "date": date_text, + "identifier": identifier, + "type": type_text if type_text not in ["GAME", "CARDMAKER"] else None, + "timestamp": timestamp, + "headline": None, + "content": image_alt, + "url": url, + "is_ai_summary": False, + "images": [{"image": image_url, "link": image_link}] + if image_url + else [], + } + + items.append(entry) + + return items \ No newline at end of file diff --git a/taito/__init__.py b/taito/__init__.py new file mode 100644 index 0000000..bc55d25 --- /dev/null +++ b/taito/__init__.py @@ -0,0 +1,7 @@ +from taito.music_diver import parse_music_diver_news_json +from taito.street_fighter import parse_sf_news_site + +__all__ = [ + "parse_music_diver_news_json", + "parse_sf_news_site", +] \ No newline at end of file diff --git a/taito/music_diver.py b/taito/music_diver.py index 5469ad5..efab0b0 100644 --- a/taito/music_diver.py +++ b/taito/music_diver.py @@ -52,6 +52,7 @@ def parse_music_diver_news_json(data_str: str): "headline": post["title"], "content": content, "url": None, - "images": images + "images": images, + "is_ai_summary": False }) return news_posts diff --git a/taito/street_fighter.py b/taito/street_fighter.py index 987b72b..bf58090 100644 --- a/taito/street_fighter.py +++ b/taito/street_fighter.py @@ -3,15 +3,12 @@ from bs4 import BeautifulSoup import re from datetime import datetime from urllib.parse import urljoin -from enum import Enum from constants import STREET_FIGHTER_NEWS_SITE import requests import base64 IMAGE_LIMIT = 10 # only allow 10 images to be processed as b64 is expensive to store -class ParserVersion(Enum): - ALPHA = 1 def _convert_image_to_base64(img_url: str): headers = { @@ -26,81 +23,72 @@ def _convert_image_to_base64(img_url: str): else: raise Exception(f"Failed to fetch image from URL: {img_url}, status code: {response.status_code}") -def make_sf_parser(identifier: str, parser: ParserVersion): - def alpha_parser(html: str): - soup = BeautifulSoup(html, "html.parser") - news_entries = [] - img_processed = 0 - news_links = soup.find_all('a', class_='btn_latestnews') - for link in news_links: - try: - url = link.get('href', '') - if url.startswith('/'): - url = urljoin(STREET_FIGHTER_NEWS_SITE, url) - info_p = link.find('p', class_='info_list_event') - if not info_p: - continue - date_span = info_p.find('span', class_='latestnews_date') - if not date_span: - continue - date_text = date_span.get_text(strip=True) - date_match = re.match(r'(\d{4}-\d{2}-\d{2})\s+(\d{2}:\d{2})\s* (.+)', date_text) - if not date_match: - continue - date_str = date_match.group(1) - time_str = date_match.group(2) - datetime_str = f"{date_str} {time_str}" - try: - post_date = datetime.strptime(datetime_str, "%Y-%m-%d %H:%M") - timestamp = int(post_date.timestamp()) - except ValueError: - continue - headline_span = info_p.find('span', class_='info_list_txt') - headline = headline_span.get_text(strip=True) if headline_span else "" - headline = re.sub(r'', ' ', headline) - headline = re.sub(r'\s+', ' ', headline).strip() - images = [] - img_div = link.find('div', class_='image') - if img_div: - img_tag = img_div.find('img') - if img_tag: - img_src = img_tag.get('src', '') - if img_src.startswith('/'): - img_src = urljoin('https://sf6ta.jp', img_src) - if img_processed <= IMAGE_LIMIT: - try: - img_b64 = _convert_image_to_base64(img_src) - images.append({ - 'image': img_b64, - 'link': url - }) - except Exception: - pass # Failed likely due to 403. Just show no images in that case - img_processed += 1 - news_entry = { - 'date': post_date.strftime("%Y-%m-%d %H:%M"), - 'identifier': identifier, - 'type': None, - 'timestamp': timestamp, - 'headline': None, - 'content': headline, # content should be prio-ed over headline - 'url': url, - 'images': images, - 'is_ai_summary': False - } - news_entries.append(news_entry) - except Exception as e: +def parse_sf_news_site(html: str): + identifier = "STREET_FIGHTER" + soup = BeautifulSoup(html, "html.parser") + news_entries = [] + img_processed = 0 + news_links = soup.find_all('a', class_='btn_latestnews') + for link in news_links: + try: + url = link.get('href', '') + if url.startswith('/'): + url = urljoin(STREET_FIGHTER_NEWS_SITE, url) + info_p = link.find('p', class_='info_list_event') + if not info_p: continue + date_span = info_p.find('span', class_='latestnews_date') + if not date_span: + continue + date_text = date_span.get_text(strip=True) + date_match = re.match(r'(\d{4}-\d{2}-\d{2})\s+(\d{2}:\d{2})\s* (.+)', date_text) + if not date_match: + continue + date_str = date_match.group(1) + time_str = date_match.group(2) + datetime_str = f"{date_str} {time_str}" + try: + post_date = datetime.strptime(datetime_str, "%Y-%m-%d %H:%M") + timestamp = int(post_date.timestamp()) + except ValueError: + continue + headline_span = info_p.find('span', class_='info_list_txt') + headline = headline_span.get_text(strip=True) if headline_span else "" + headline = re.sub(r'', ' ', headline) + headline = re.sub(r'\s+', ' ', headline).strip() + images = [] + img_div = link.find('div', class_='image') + if img_div: + img_tag = img_div.find('img') + if img_tag: + img_src = img_tag.get('src', '') + if img_src.startswith('/'): + img_src = urljoin('https://sf6ta.jp', img_src) + if img_processed <= IMAGE_LIMIT: + try: + img_b64 = _convert_image_to_base64(img_src) + images.append({ + 'image': img_b64, + 'link': url + }) + except Exception: + pass # Failed likely due to 403. Just show no images in that case + img_processed += 1 + news_entry = { + 'date': post_date.strftime("%Y-%m-%d %H:%M"), + 'identifier': identifier, + 'type': None, + 'timestamp': timestamp, + 'headline': None, + 'content': headline, # content should be prio-ed over headline + 'url': url, + 'images': images, + 'is_ai_summary': False + } + news_entries.append(news_entry) - return news_entries - - if parser == ParserVersion.ALPHA: - return alpha_parser - else: - raise ValueError("Unknown Parser Version") - + except Exception: + continue -parse_sf_news_site = make_sf_parser( - "STREET_FIGHTER", ParserVersion.ALPHA -) + return news_entries \ No newline at end of file -- cgit v1.2.3