aboutsummaryrefslogtreecommitdiffstats
path: root/bandai_namco
diff options
context:
space:
mode:
authorPinapelz <yukais@pinapelz.com>2025-05-14 15:29:14 -0700
committerPinapelz <yukais@pinapelz.com>2025-05-14 15:29:14 -0700
commita98cbc880c6e2a862ee1a1716fe522410f91ccd9 (patch)
treea6d8f6160472e4c7b49426d0a6e670a860ca15c3 /bandai_namco
parent046a668957f3827a59c9752869a0f7a060c9e79f (diff)
wmmt: 6rr and 6rr+ scraper
Diffstat (limited to 'bandai_namco')
-rw-r--r--bandai_namco/wmmt.py142
1 files changed, 138 insertions, 4 deletions
diff --git a/bandai_namco/wmmt.py b/bandai_namco/wmmt.py
index 86a2ce4..eaa4767 100644
--- a/bandai_namco/wmmt.py
+++ b/bandai_namco/wmmt.py
@@ -15,7 +15,15 @@ TYPE_MAP = {
"Online Events Information": "EVENTS",
"Update Information": "UPDATE",
"Future Lab News": "FUTURE LAB",
- "Special Contents": "SPECIAL"
+ "Special Contents": "SPECIAL",
+ "Navi Scratch-off Item": "NAVI-SCRATCH",
+ "News": "NEWS",
+ "オンラインイベント情報": "EVENTS",
+ "アップデート情報": "UPDATE",
+ "未来研通信": "FUTURE LAB",
+ "スペシャルコンテンツ": "SPECIAL",
+ "ナビスクラッチ配信アイテム": "NAVI-SCRATCH",
+ "ニュース": "NEWS"
}
def make_wmmt_parser(version: constants.WANGAN_MAXI_VERSION):
@@ -46,15 +54,72 @@ def make_wmmt_parser(version: constants.WANGAN_MAXI_VERSION):
"url": url,
"title": title,
"date": date,
- "type": TYPE_MAP[type_name]
+ "type": TYPE_MAP.get(type_name, "Unknown")
})
count += 1
return results
+
+ def six_rr_parser(html: str):
+ soup = BeautifulSoup(html, "html.parser")
+ results = []
+ for section in soup.select("div.parts_column_02 > div.parts_bg_01"):
+ type_heading = section.select_one("section h2.parts_txt_01")
+ type_name = type_heading.get_text(strip=True) if type_heading else None
+ count = 0
+ for a in section.select("ul.archiveNav a[href]"):
+ if count >= constants.WANGAN_MAXI_POSTS_PER_SECTION:
+ break
+ href = a["href"]
+ date_tag = a.find("p")
+ title_tag = a.find("h4")
+ title = title_tag.get_text(strip=True) if title_tag else "No title"
+ date = date_tag.get_text(strip=True) if date_tag else "No date"
+ url = urljoin(BASE_URL, href)
+ url = url.replace(".php", ".html")
+ results.append({
+ "url": url,
+ "title": title,
+ "date": date,
+ "type": TYPE_MAP.get(type_name, "Unknown")
+ })
+ count += 1
+ return results
+
+ def six_rr_plus_parser(html: str):
+ soup = BeautifulSoup(html, "html.parser")
+ results = []
+ for section in soup.select("div.parts_column_02 > div.parts_bg_01"):
+ type_heading = section.select_one("section h2.parts_txt_01")
+ type_name = type_heading.get_text(strip=True) if type_heading else None
+ count = 0
+ for a in section.select("ul.archiveNav a[href]"):
+ if count >= constants.WANGAN_MAXI_POSTS_PER_SECTION:
+ break
+ href = a["href"]
+ date_tag = a.find("p")
+ title_tag = a.find("h4")
+ title = title_tag.get_text(strip=True) if title_tag else "No title"
+ date = date_tag.get_text(strip=True) if date_tag else "No date"
+ url = urljoin(BASE_URL, href)
+ url = url.replace(".php", ".html")
+ results.append({
+ "url": url,
+ "title": title,
+ "date": date,
+ "type": TYPE_MAP.get(type_name, "Unknown")
+ })
+ count += 1
+ return results
+
if version == constants.WANGAN_MAXI_VERSION.FIVE_DX_PLUS:
return five_dx_plus_parser
+ elif version == constants.WANGAN_MAXI_VERSION.SIX_RR:
+ return six_rr_parser
+ elif version == constants.WANGAN_MAXI_VERSION.SIX_RR_PLUS:
+ return six_rr_plus_parser
-def make_wmmt_news_extractor(identifier: str, version: constants.WANGAN_MAXI_VERSION, internal_path: str):
+def make_wmmt_news_extractor(identifier: str, version: constants.WANGAN_MAXI_VERSION, internal_path: str, region_text: str):
def five_dx_plus_extractor(html: str, data: dict):
image_base = BASE_URL + "/" + internal_path
soup = BeautifulSoup(html, "html.parser")
@@ -82,6 +147,67 @@ def make_wmmt_news_extractor(identifier: str, version: constants.WANGAN_MAXI_VER
"image": img_url,
"link": urljoin(BASE_URL, parent.get("href")) if parent and parent.get("href") else None
})
+ data["type"] = "["+region_text+"]" + " " + data["type"]
+ data["identifier"] = identifier
+ data["timestamp"] = timestamp
+ data["content"] = content
+ data["images"] = images
+ data["is_ai_summary"] = False
+ return data
+
+ def six_rr_extractor(html: str, data: dict):
+ image_base = BASE_URL + "/" + internal_path
+ soup = BeautifulSoup(html, "html.parser")
+ container = soup.select_one(".parts_column_02")
+ if not container:
+ return None
+ date_str = data["date"]
+ timestamp = int(datetime.strptime(date_str, "%Y/%m/%d").replace(tzinfo=timezone.utc).timestamp())
+ first_p = container.find("p")
+ content = first_p.get_text(" ", strip=True) if first_p else ""
+ images = []
+ for img in container.select("img"):
+ src = img.get("src")
+ if not src:
+ continue
+ src = src.replace("./", "").lstrip("/")
+ img_url = f"{image_base}/{src}"
+ parent = img.find_parent("a")
+ images.append({
+ "image": img_url,
+ "link": urljoin(BASE_URL, parent.get("href")) if parent and parent.get("href") else None
+ })
+ data["type"] = "["+region_text+"]" + " " + data["type"]
+ data["identifier"] = identifier
+ data["timestamp"] = timestamp
+ data["content"] = content
+ data["images"] = images
+ data["is_ai_summary"] = False
+ return data
+
+ def six_rr_plus_extractor(html: str, data: dict):
+ image_base = BASE_URL + "/" + internal_path
+ soup = BeautifulSoup(html, "html.parser")
+ container = soup.select_one(".parts_column_02")
+ if not container:
+ return None
+ date_str = data["date"]
+ timestamp = int(datetime.strptime(date_str, "%Y/%m/%d").replace(tzinfo=timezone.utc).timestamp())
+ first_p = container.find("p")
+ content = first_p.get_text(" ", strip=True) if first_p else ""
+ images = []
+ for img in container.select("img"):
+ src = img.get("src")
+ if not src:
+ continue
+ src = src.replace("./", "").lstrip("/")
+ img_url = f"{image_base}/{src}"
+ parent = img.find_parent("a")
+ images.append({
+ "image": img_url,
+ "link": urljoin(BASE_URL, parent.get("href")) if parent and parent.get("href") else None
+ })
+ data["type"] = "["+region_text+"]" + " " + data["type"]
data["identifier"] = identifier
data["timestamp"] = timestamp
data["content"] = content
@@ -91,6 +217,14 @@ def make_wmmt_news_extractor(identifier: str, version: constants.WANGAN_MAXI_VER
if version == constants.WANGAN_MAXI_VERSION.FIVE_DX_PLUS:
return five_dx_plus_extractor
+ elif version == constants.WANGAN_MAXI_VERSION.SIX_RR:
+ return six_rr_extractor
+ elif version == constants.WANGAN_MAXI_VERSION.SIX_RR_PLUS:
+ return six_rr_plus_extractor
get_wmmt_na_news_post_links = make_wmmt_parser(constants.WANGAN_MAXI_VERSION.FIVE_DX_PLUS)
-parse_wmmt_na_news = make_wmmt_news_extractor("WANGAN_MAXI_NA", constants.WANGAN_MAXI_VERSION.FIVE_DX_PLUS, "wanganmaxi5dxplus/na")
+get_wmmt_asia_oce_news_post_links = make_wmmt_parser(constants.WANGAN_MAXI_VERSION.SIX_RR)
+get_wmmt_jp_news_post_links = make_wmmt_parser(constants.WANGAN_MAXI_VERSION.SIX_RR_PLUS)
+parse_wmmt_na_news = make_wmmt_news_extractor("WANGAN_MAXI_NA", constants.WANGAN_MAXI_VERSION.FIVE_DX_PLUS, "wanganmaxi5dxplus/na", "NA")
+parse_wmmt_asia_oce_news = make_wmmt_news_extractor("WANGAN_MAXI_ASIA_OCE", constants.WANGAN_MAXI_VERSION.SIX_RR, "wanganmaxi6rr/en", "ASIA/OCE")
+parse_wmmt_jp_news = make_wmmt_news_extractor("WANGAN_MAXI_JP", constants.WANGAN_MAXI_VERSION.SIX_RR_PLUS, "wanganmaxi6rrplus/jp", "JPN")
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage