diff options
| author | Pinapelz <yukais@pinapelz.com> | 2025-05-13 18:59:08 -0700 |
|---|---|---|
| committer | Pinapelz <yukais@pinapelz.com> | 2025-05-13 18:59:08 -0700 |
| commit | 046a668957f3827a59c9752869a0f7a060c9e79f (patch) | |
| tree | e5a416152da50338a2d196b4cee8e12ad52fbfc7 | |
| parent | c8a90d8f526941f9afe47156ddc4840c0b218fc9 (diff) | |
initial wmmt NA 5dx+ scraper
| -rw-r--r-- | README.md | 2 | ||||
| -rw-r--r-- | bandai_namco/wmmt.py | 96 | ||||
| -rw-r--r-- | constants.py | 13 | ||||
| -rw-r--r-- | news_feed.py | 12 |
4 files changed, 123 insertions, 0 deletions
```diff
@@ -17,6 +17,8 @@ Currently Supported:
 - jubeat (`jubeat_news`)
 - GITADORA (`gitadora_news`)
 - NOSTALGIA (`nostalgia_news`)
+- DanceRush (`dance_rush_news`)
+- DANCE aROUND (`dance_around_news`)
 - CHUNITHM (JPN) (`chunithm_jpn_news`)
 - CHUNITHM (INTL) (`chunithm_intl_news`)
 - maimai DX (JPN) (`maimaidx_jp_news`)
diff --git a/bandai_namco/wmmt.py b/bandai_namco/wmmt.py
new file mode 100644
index 0000000..86a2ce4
--- /dev/null
+++ b/bandai_namco/wmmt.py
@@ -0,0 +1,96 @@
+import re
+from datetime import datetime, timedelta, timezone
+from enum import Enum
+from urllib.parse import urljoin
+import sys
+import os
+import pytz
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))
+import constants
+from bs4 import BeautifulSoup
+
+BASE_URL = "https://wanganmaxi-official.com"
+
+TYPE_MAP = {
+    "Online Events Information": "EVENTS",
+    "Update Information": "UPDATE",
+    "Future Lab News": "FUTURE LAB",
+    "Special Contents": "SPECIAL"
+}
+
+def make_wmmt_parser(version: constants.WANGAN_MAXI_VERSION):
+    def five_dx_plus_parser(html: str):
+        soup = BeautifulSoup(html, "html.parser")
+        results = []
+        for section in soup.select("div.parts_column_02 > div.parts_bg_01"):
+            type_heading = section.select_one("section h2.parts_txt_01")
+            type_name = type_heading.get_text(strip=True) if type_heading else None
+            count = 0
+            for a in section.select("ul.archiveNav a[href]"):
+                if count >= constants.WANGAN_MAXI_POSTS_PER_SECTION:
+                    break
+                href = a["href"]
+                title_tag = a.find("h4")
+                date_tag = a.find("p")
+                title_parts = []
+                for child in title_tag.children:
+                    if child.name == "span":
+                        title_parts.append(f"[{child.get_text(strip=True)}]")
+                    elif isinstance(child, str):
+                        title_parts.append(child.strip())
+                title = " ".join(title_parts).strip()
+                date = date_tag.get_text(strip=True) if date_tag else "No date"
+                url = urljoin(BASE_URL, href)
+                url = url.replace(".php", ".html")
+                results.append({
+                    "url": url,
+                    "title": title,
+                    "date": date,
+                    "type": TYPE_MAP[type_name]
+                })
+                count += 1
+        return results
+    if version == constants.WANGAN_MAXI_VERSION.FIVE_DX_PLUS:
+        return five_dx_plus_parser
+
+
+def make_wmmt_news_extractor(identifier: str, version: constants.WANGAN_MAXI_VERSION, internal_path: str):
+    def five_dx_plus_extractor(html: str, data: dict):
+        image_base = BASE_URL + "/" + internal_path
+        soup = BeautifulSoup(html, "html.parser")
+        container = soup.select_one(".parts_inner_01")
+        if not container:
+            return None
+        date_str = data["date"]
+        timestamp = int(datetime.strptime(date_str, "%Y/%m/%d").replace(tzinfo=timezone.utc).timestamp())
+        first_p = container.find("p")
+        content = first_p.get_text(" ", strip=True) if first_p else ""
+        images = []
+        for img in container.find_all("img"):
+            src = img.get("src").replace("./","")
+            if data["type"] == "EVENTS":
+                src = "event/online/" + src
+            elif data["type"] == "SPECIAL":
+                src = "special/" + src
+            elif data["type"] == "FUTURE LAB":
+                src = "miraiken/" + src
+            elif data["type"] == "UPDATE":
+                src = "update/" + src
+            img_url = image_base + "/" + src if src else None
+            parent = img.find_parent("a")
+            images.append({
+                "image": img_url,
+                "link": urljoin(BASE_URL, parent.get("href")) if parent and parent.get("href") else None
+            })
+        data["identifier"] = identifier
+        data["timestamp"] = timestamp
+        data["content"] = content
+        data["images"] = images
+        data["is_ai_summary"] = False
+        return data
+
+    if version == constants.WANGAN_MAXI_VERSION.FIVE_DX_PLUS:
+        return five_dx_plus_extractor
+
+get_wmmt_na_news_post_links = make_wmmt_parser(constants.WANGAN_MAXI_VERSION.FIVE_DX_PLUS)
+parse_wmmt_na_news = make_wmmt_news_extractor("WANGAN_MAXI_NA", constants.WANGAN_MAXI_VERSION.FIVE_DX_PLUS, "wanganmaxi5dxplus/na")
diff --git a/constants.py b/constants.py
index e990f87..e6b7e57 100644
--- a/constants.py
+++ b/constants.py
@@ -27,6 +27,12 @@ ONGEKI_JP_NEWS_SITE="https://info-ongeki.sega.jp/"
 
 MUSIC_DIVER_NEWS="https://mypage.musicdiver.jp/api/news?lang=en"
 TAIKO_BLOG_SITE="https://taiko-ch.net/blog/"
+WANGAN_MAXI_GENERIC="https://wanganmaxi-official.com/"
+WANGAN_MAXI_NA_NEWS_SITE="https://wanganmaxi-official.com/wanganmaxi5dxplus/na/archive"
+WANGAN_MAXI_ASIA_OCE_NEWS_SITE="https://wanganmaxi-official.com/wanganmaxi6rr/en/archive/"
+WANGAN_MAXI_JP_NEWS_SITE="https://wanganmaxi-official.com/wanganmaxi6rrplus/jp/archive/"
+WANGAN_MAXI_POSTS_PER_SECTION=3
+# due to how dead the NA version is, these will be merged into a singular feed
 
 ADD_EN_TRANSLATION=True # Only takes effect if an API key is provided in .env
 CHUNI_RECURSIVE_IMAGE=True # Scrape the individual post pages and get all images there
@@ -45,3 +51,10 @@ class MAIMAIDX_VERSION(Enum):
 
 class ONGEKI_VERSION(Enum):
     REFRESH = 1
+
+class WANGAN_MAXI_VERSION(Enum):
+    FIVE_DX_PLUS = 1,
+    SIX = 2,
+    SIX_R = 3,
+    SIX_RR = 4,
+    SIX_RR_PLUS = 5
diff --git a/news_feed.py b/news_feed.py
index 0a49707..d621984 100644
--- a/news_feed.py
+++ b/news_feed.py
@@ -32,6 +32,7 @@ import sega.maimaidx_intl as maimaidx_intl
 import sega.ongeki_jp as ongeki_jp
 import taito.music_diver as music_diver
 import bandai_namco.taiko as taiko
+import bandai_namco.wmmt as wmmt
 import community.disc as disc
 import community.wacca_plus.wacca_plus as wac_plus
 import community.museca_plus as mus_plus
@@ -166,6 +167,17 @@ def get_news(news_url: str, version=None) -> list:
         news_posts = sorted(taiko.parse_taiko_blog_site(site_data),
                             key=lambda x: x['timestamp'], reverse=True)
         news_posts = translate.add_translate_text_to_en(news_posts)
+    elif news_url == constants.WANGAN_MAXI_GENERIC:
+        news_posts = []
+        na_site_data = download_site_as_html(constants.WANGAN_MAXI_NA_NEWS_SITE)
+        prelim_na_news_data = wmmt.get_wmmt_na_news_post_links(na_site_data)
+        for data in prelim_na_news_data:
+            post_site_data = download_site_as_html(data["url"])
+            news_posts.append(wmmt.parse_wmmt_na_news(post_site_data, data))
+        print(news_posts)
+        exit()
+
+
     elif news_url == constants.WACCA_PLUS_MAGIC_STRING:
         if not wac_plus.check_is_generation_possible():
             news_posts = []
```
