# bandai_namco/wmmt.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96

import re
from datetime import datetime, timedelta, timezone
from enum import Enum
from urllib.parse import urljoin
import sys
import os
import pytz
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))
import constants
from bs4 import BeautifulSoup

# Site root; relative post links and image paths are resolved against it.
BASE_URL = "https://wanganmaxi-official.com"

# Section heading text on the news index page -> short post-type tag
# stored under the "type" key of each parsed post.
TYPE_MAP = dict(
    (
        ("Online Events Information", "EVENTS"),
        ("Update Information", "UPDATE"),
        ("Future Lab News", "FUTURE LAB"),
        ("Special Contents", "SPECIAL"),
    )
)

def make_wmmt_parser(version: constants.WANGAN_MAXI_VERSION):
    """Return the news-index parser matching a site version.

    Args:
        version: Site version to build a parser for. Only
            ``constants.WANGAN_MAXI_VERSION.FIVE_DX_PLUS`` is handled.

    Returns:
        A callable ``parser(html) -> list[dict]`` for ``FIVE_DX_PLUS``,
        or ``None`` for any other version.
    """
    def five_dx_plus_parser(html: str):
        """Extract post links from the news index page HTML.

        Each ``div.parts_bg_01`` section is identified by its ``h2`` heading,
        which is translated through ``TYPE_MAP``. At most
        ``constants.WANGAN_MAXI_POSTS_PER_SECTION`` posts are taken per
        section.

        Returns:
            A list of dicts with the keys ``url``, ``title``, ``date`` and
            ``type``. ``date`` is the text of the anchor's first ``<p>`` or
            the placeholder ``"No date"`` when there is none.
        """
        soup = BeautifulSoup(html, "html.parser")
        results = []
        for section in soup.select("div.parts_column_02 > div.parts_bg_01"):
            type_heading = section.select_one("section h2.parts_txt_01")
            type_name = type_heading.get_text(strip=True) if type_heading else None
            # A section with a missing or unrecognised heading cannot be
            # classified; skip it instead of failing the whole page.
            post_type = TYPE_MAP.get(type_name)
            if post_type is None:
                continue
            count = 0
            for a in section.select("ul.archiveNav a[href]"):
                if count >= constants.WANGAN_MAXI_POSTS_PER_SECTION:
                    break
                title_tag = a.find("h4")
                if title_tag is None:
                    # Not a post entry (no title element); it does not count
                    # towards the per-section limit.
                    continue
                date_tag = a.find("p")
                title_parts = []
                for child in title_tag.children:
                    if child.name == "span":
                        # Badge-style spans are rendered as "[text]".
                        title_parts.append(f"[{child.get_text(strip=True)}]")
                    elif isinstance(child, str):
                        title_parts.append(child.strip())
                title = " ".join(title_parts).strip()
                date = date_tag.get_text(strip=True) if date_tag else "No date"
                url = urljoin(BASE_URL, a["href"])
                # Index links end in .php; the stored URL uses .html instead.
                url = url.replace(".php", ".html")
                results.append({
                    "url": url,
                    "title": title,
                    "date": date,
                    "type": post_type,
                })
                count += 1
        return results

    if version == constants.WANGAN_MAXI_VERSION.FIVE_DX_PLUS:
        return five_dx_plus_parser
    # No parser exists for other versions.
    return None


def make_wmmt_news_extractor(identifier: str, version: constants.WANGAN_MAXI_VERSION, internal_path: str):
    """Return the post-page extractor matching a site version.

    Args:
        identifier: Value stored under ``data["identifier"]`` for every post.
        version: Site version to build an extractor for. Only
            ``constants.WANGAN_MAXI_VERSION.FIVE_DX_PLUS`` is handled.
        internal_path: Path below ``BASE_URL`` under which image URLs are
            built.

    Returns:
        A callable ``extractor(html, data)`` for ``FIVE_DX_PLUS``, or
        ``None`` for any other version.
    """
    # Directory, relative to internal_path, prepended to image paths for
    # each post type. Unknown types get no prefix.
    image_dirs = {
        "EVENTS": "event/online/",
        "SPECIAL": "special/",
        "FUTURE LAB": "miraiken/",
        "UPDATE": "update/",
    }

    def five_dx_plus_extractor(html: str, data: dict):
        """Fill ``data`` with the details scraped from one post page.

        Args:
            html: HTML of the post page.
            data: Post dict; ``data["date"]`` (``YYYY/MM/DD``) is always
                read, ``data["type"]`` is read when the page has images.
                The dict is updated in place.

        Returns:
            The same ``data`` dict with ``identifier``, ``timestamp``,
            ``content``, ``images`` and ``is_ai_summary`` set, or ``None``
            when the page has no ``.parts_inner_01`` container.

        Raises:
            ValueError: If ``data["date"]`` is not in ``%Y/%m/%d`` format
                (for example the ``"No date"`` placeholder).
        """
        image_base = BASE_URL + "/" + internal_path
        soup = BeautifulSoup(html, "html.parser")
        container = soup.select_one(".parts_inner_01")
        if not container:
            return None
        date_str = data["date"]
        # The date is interpreted as midnight UTC.
        timestamp = int(datetime.strptime(date_str, "%Y/%m/%d").replace(tzinfo=timezone.utc).timestamp())
        first_p = container.find("p")
        content = first_p.get_text(" ", strip=True) if first_p else ""
        images = []
        for img in container.find_all("img"):
            prefix = image_dirs.get(data["type"], "")
            # An <img> without a src attribute yields None from .get(); treat
            # it like an empty src instead of raising AttributeError.
            src = (img.get("src") or "").replace("./", "")
            # Check for an empty path before prefixing, otherwise the prefix
            # alone would be turned into a bogus directory URL.
            img_url = image_base + "/" + prefix + src if src else None
            parent = img.find_parent("a")
            images.append({
                "image": img_url,
                "link": urljoin(BASE_URL, parent.get("href")) if parent and parent.get("href") else None
            })
        data["identifier"] = identifier
        data["timestamp"] = timestamp
        data["content"] = content
        data["images"] = images
        data["is_ai_summary"] = False
        return data

    if version == constants.WANGAN_MAXI_VERSION.FIVE_DX_PLUS:
        return five_dx_plus_extractor
    # No extractor exists for other versions.
    return None

# Ready-made callables for the "wanganmaxi5dxplus/na" site, both built for
# the FIVE_DX_PLUS page layout.
get_wmmt_na_news_post_links = make_wmmt_parser(
    version=constants.WANGAN_MAXI_VERSION.FIVE_DX_PLUS,
)
parse_wmmt_na_news = make_wmmt_news_extractor(
    identifier="WANGAN_MAXI_NA",
    version=constants.WANGAN_MAXI_VERSION.FIVE_DX_PLUS,
    internal_path="wanganmaxi5dxplus/na",
)