aboutsummaryrefslogtreecommitdiffstats
path: root/bandai_namco
diff options
context:
space:
mode:
authorPinapelz <yukais@pinapelz.com>2025-05-13 18:59:08 -0700
committerPinapelz <yukais@pinapelz.com>2025-05-13 18:59:08 -0700
commit046a668957f3827a59c9752869a0f7a060c9e79f (patch)
treee5a416152da50338a2d196b4cee8e12ad52fbfc7 /bandai_namco
parentc8a90d8f526941f9afe47156ddc4840c0b218fc9 (diff)
initial wmmt NA 5dx+ scraper
Diffstat (limited to 'bandai_namco')
-rw-r--r--bandai_namco/wmmt.py96
1 files changed, 96 insertions, 0 deletions
diff --git a/bandai_namco/wmmt.py b/bandai_namco/wmmt.py
new file mode 100644
index 0000000..86a2ce4
--- /dev/null
+++ b/bandai_namco/wmmt.py
@@ -0,0 +1,96 @@
+import re
+from datetime import datetime, timedelta, timezone
+from enum import Enum
+from urllib.parse import urljoin
+import sys
+import os
+import pytz
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))
+import constants
+from bs4 import BeautifulSoup
+
+BASE_URL = "https://wanganmaxi-official.com"
+
+TYPE_MAP = {
+ "Online Events Information": "EVENTS",
+ "Update Information": "UPDATE",
+ "Future Lab News": "FUTURE LAB",
+ "Special Contents": "SPECIAL"
+}
+
+def make_wmmt_parser(version: constants.WANGAN_MAXI_VERSION):
+ def five_dx_plus_parser(html: str):
+ soup = BeautifulSoup(html, "html.parser")
+ results = []
+ for section in soup.select("div.parts_column_02 > div.parts_bg_01"):
+ type_heading = section.select_one("section h2.parts_txt_01")
+ type_name = type_heading.get_text(strip=True) if type_heading else None
+ count = 0
+ for a in section.select("ul.archiveNav a[href]"):
+ if count >= constants.WANGAN_MAXI_POSTS_PER_SECTION:
+ break
+ href = a["href"]
+ title_tag = a.find("h4")
+ date_tag = a.find("p")
+ title_parts = []
+ for child in title_tag.children:
+ if child.name == "span":
+ title_parts.append(f"[{child.get_text(strip=True)}]")
+ elif isinstance(child, str):
+ title_parts.append(child.strip())
+ title = " ".join(title_parts).strip()
+ date = date_tag.get_text(strip=True) if date_tag else "No date"
+ url = urljoin(BASE_URL, href)
+ url = url.replace(".php", ".html")
+ results.append({
+ "url": url,
+ "title": title,
+ "date": date,
+ "type": TYPE_MAP[type_name]
+ })
+ count += 1
+ return results
+ if version == constants.WANGAN_MAXI_VERSION.FIVE_DX_PLUS:
+ return five_dx_plus_parser
+
+
+def make_wmmt_news_extractor(identifier: str, version: constants.WANGAN_MAXI_VERSION, internal_path: str):
+ def five_dx_plus_extractor(html: str, data: dict):
+ image_base = BASE_URL + "/" + internal_path
+ soup = BeautifulSoup(html, "html.parser")
+ container = soup.select_one(".parts_inner_01")
+ if not container:
+ return None
+ date_str = data["date"]
+ timestamp = int(datetime.strptime(date_str, "%Y/%m/%d").replace(tzinfo=timezone.utc).timestamp())
+ first_p = container.find("p")
+ content = first_p.get_text(" ", strip=True) if first_p else ""
+ images = []
+ for img in container.find_all("img"):
+ src = img.get("src").replace("./","")
+ if data["type"] == "EVENTS":
+ src = "event/online/" + src
+ elif data["type"] == "SPECIAL":
+ src = "special/" + src
+ elif data["type"] == "FUTURE LAB":
+ src = "miraiken/" + src
+ elif data["type"] == "UPDATE":
+ src = "update/" + src
+ img_url = image_base + "/" + src if src else None
+ parent = img.find_parent("a")
+ images.append({
+ "image": img_url,
+ "link": urljoin(BASE_URL, parent.get("href")) if parent and parent.get("href") else None
+ })
+ data["identifier"] = identifier
+ data["timestamp"] = timestamp
+ data["content"] = content
+ data["images"] = images
+ data["is_ai_summary"] = False
+ return data
+
+ if version == constants.WANGAN_MAXI_VERSION.FIVE_DX_PLUS:
+ return five_dx_plus_extractor
+
+get_wmmt_na_news_post_links = make_wmmt_parser(constants.WANGAN_MAXI_VERSION.FIVE_DX_PLUS)
+parse_wmmt_na_news = make_wmmt_news_extractor("WANGAN_MAXI_NA", constants.WANGAN_MAXI_VERSION.FIVE_DX_PLUS, "wanganmaxi5dxplus/na")
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage