about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- bandai_namco/wmmt.py 142
-rw-r--r-- generate.py 5
-rw-r--r-- news_feed.py 25
-rw-r--r-- site_scraper.py 4
4 files changed, 168 insertions, 8 deletions
diff --git a/bandai_namco/wmmt.py b/bandai_namco/wmmt.py
index 86a2ce4..eaa4767 100644
--- a/bandai_namco/wmmt.py
+++ b/bandai_namco/wmmt.py
@@ -15,7 +15,15 @@ TYPE_MAP = {
"Online Events Information": "EVENTS",
"Update Information": "UPDATE",
"Future Lab News": "FUTURE LAB",
- "Special Contents": "SPECIAL"
+ "Special Contents": "SPECIAL",
+ "Navi Scratch-off Item": "NAVI-SCRATCH",
+ "News": "NEWS",
+ "オンラインイベント情報": "EVENTS",
+ "アップデート情報": "UPDATE",
+ "未来研通信": "FUTURE LAB",
+ "スペシャルコンテンツ": "SPECIAL",
+ "ナビスクラッチ配信アイテム": "NAVI-SCRATCH",
+ "ニュース": "NEWS"
}
def make_wmmt_parser(version: constants.WANGAN_MAXI_VERSION):
@@ -46,15 +54,72 @@ def make_wmmt_parser(version: constants.WANGAN_MAXI_VERSION):
"url": url,
"title": title,
"date": date,
- "type": TYPE_MAP[type_name]
+ "type": TYPE_MAP.get(type_name, "Unknown")
})
count += 1
return results
+
+ def six_rr_parser(html: str):
+ soup = BeautifulSoup(html, "html.parser")
+ results = []
+ for section in soup.select("div.parts_column_02 > div.parts_bg_01"):
+ type_heading = section.select_one("section h2.parts_txt_01")
+ type_name = type_heading.get_text(strip=True) if type_heading else None
+ count = 0
+ for a in section.select("ul.archiveNav a[href]"):
+ if count >= constants.WANGAN_MAXI_POSTS_PER_SECTION:
+ break
+ href = a["href"]
+ date_tag = a.find("p")
+ title_tag = a.find("h4")
+ title = title_tag.get_text(strip=True) if title_tag else "No title"
+ date = date_tag.get_text(strip=True) if date_tag else "No date"
+ url = urljoin(BASE_URL, href)
+ url = url.replace(".php", ".html")
+ results.append({
+ "url": url,
+ "title": title,
+ "date": date,
+ "type": TYPE_MAP.get(type_name, "Unknown")
+ })
+ count += 1
+ return results
+
+ def six_rr_plus_parser(html: str):
+ soup = BeautifulSoup(html, "html.parser")
+ results = []
+ for section in soup.select("div.parts_column_02 > div.parts_bg_01"):
+ type_heading = section.select_one("section h2.parts_txt_01")
+ type_name = type_heading.get_text(strip=True) if type_heading else None
+ count = 0
+ for a in section.select("ul.archiveNav a[href]"):
+ if count >= constants.WANGAN_MAXI_POSTS_PER_SECTION:
+ break
+ href = a["href"]
+ date_tag = a.find("p")
+ title_tag = a.find("h4")
+ title = title_tag.get_text(strip=True) if title_tag else "No title"
+ date = date_tag.get_text(strip=True) if date_tag else "No date"
+ url = urljoin(BASE_URL, href)
+ url = url.replace(".php", ".html")
+ results.append({
+ "url": url,
+ "title": title,
+ "date": date,
+ "type": TYPE_MAP.get(type_name, "Unknown")
+ })
+ count += 1
+ return results
+
if version == constants.WANGAN_MAXI_VERSION.FIVE_DX_PLUS:
return five_dx_plus_parser
+ elif version == constants.WANGAN_MAXI_VERSION.SIX_RR:
+ return six_rr_parser
+ elif version == constants.WANGAN_MAXI_VERSION.SIX_RR_PLUS:
+ return six_rr_plus_parser
-def make_wmmt_news_extractor(identifier: str, version: constants.WANGAN_MAXI_VERSION, internal_path: str):
+def make_wmmt_news_extractor(identifier: str, version: constants.WANGAN_MAXI_VERSION, internal_path: str, region_text: str):
def five_dx_plus_extractor(html: str, data: dict):
image_base = BASE_URL + "/" + internal_path
soup = BeautifulSoup(html, "html.parser")
@@ -82,6 +147,67 @@ def make_wmmt_news_extractor(identifier: str, version: constants.WANGAN_MAXI_VER
"image": img_url,
"link": urljoin(BASE_URL, parent.get("href")) if parent and parent.get("href") else None
})
+ data["type"] = "["+region_text+"]" + " " + data["type"]
+ data["identifier"] = identifier
+ data["timestamp"] = timestamp
+ data["content"] = content
+ data["images"] = images
+ data["is_ai_summary"] = False
+ return data
+
+ def six_rr_extractor(html: str, data: dict):
+ image_base = BASE_URL + "/" + internal_path
+ soup = BeautifulSoup(html, "html.parser")
+ container = soup.select_one(".parts_column_02")
+ if not container:
+ return None
+ date_str = data["date"]
+ timestamp = int(datetime.strptime(date_str, "%Y/%m/%d").replace(tzinfo=timezone.utc).timestamp())
+ first_p = container.find("p")
+ content = first_p.get_text(" ", strip=True) if first_p else ""
+ images = []
+ for img in container.select("img"):
+ src = img.get("src")
+ if not src:
+ continue
+ src = src.replace("./", "").lstrip("/")
+ img_url = f"{image_base}/{src}"
+ parent = img.find_parent("a")
+ images.append({
+ "image": img_url,
+ "link": urljoin(BASE_URL, parent.get("href")) if parent and parent.get("href") else None
+ })
+ data["type"] = "["+region_text+"]" + " " + data["type"]
+ data["identifier"] = identifier
+ data["timestamp"] = timestamp
+ data["content"] = content
+ data["images"] = images
+ data["is_ai_summary"] = False
+ return data
+
+ def six_rr_plus_extractor(html: str, data: dict):
+ image_base = BASE_URL + "/" + internal_path
+ soup = BeautifulSoup(html, "html.parser")
+ container = soup.select_one(".parts_column_02")
+ if not container:
+ return None
+ date_str = data["date"]
+ timestamp = int(datetime.strptime(date_str, "%Y/%m/%d").replace(tzinfo=timezone.utc).timestamp())
+ first_p = container.find("p")
+ content = first_p.get_text(" ", strip=True) if first_p else ""
+ images = []
+ for img in container.select("img"):
+ src = img.get("src")
+ if not src:
+ continue
+ src = src.replace("./", "").lstrip("/")
+ img_url = f"{image_base}/{src}"
+ parent = img.find_parent("a")
+ images.append({
+ "image": img_url,
+ "link": urljoin(BASE_URL, parent.get("href")) if parent and parent.get("href") else None
+ })
+ data["type"] = "["+region_text+"]" + " " + data["type"]
data["identifier"] = identifier
data["timestamp"] = timestamp
data["content"] = content
@@ -91,6 +217,14 @@ def make_wmmt_news_extractor(identifier: str, version: constants.WANGAN_MAXI_VER
if version == constants.WANGAN_MAXI_VERSION.FIVE_DX_PLUS:
return five_dx_plus_extractor
+ elif version == constants.WANGAN_MAXI_VERSION.SIX_RR:
+ return six_rr_extractor
+ elif version == constants.WANGAN_MAXI_VERSION.SIX_RR_PLUS:
+ return six_rr_plus_extractor
get_wmmt_na_news_post_links = make_wmmt_parser(constants.WANGAN_MAXI_VERSION.FIVE_DX_PLUS)
-parse_wmmt_na_news = make_wmmt_news_extractor("WANGAN_MAXI_NA", constants.WANGAN_MAXI_VERSION.FIVE_DX_PLUS, "wanganmaxi5dxplus/na")
+get_wmmt_asia_oce_news_post_links = make_wmmt_parser(constants.WANGAN_MAXI_VERSION.SIX_RR)
+get_wmmt_jp_news_post_links = make_wmmt_parser(constants.WANGAN_MAXI_VERSION.SIX_RR_PLUS)
+parse_wmmt_na_news = make_wmmt_news_extractor("WANGAN_MAXI_NA", constants.WANGAN_MAXI_VERSION.FIVE_DX_PLUS, "wanganmaxi5dxplus/na", "NA")
+parse_wmmt_asia_oce_news = make_wmmt_news_extractor("WANGAN_MAXI_ASIA_OCE", constants.WANGAN_MAXI_VERSION.SIX_RR, "wanganmaxi6rr/en", "ASIA/OCE")
+parse_wmmt_jp_news = make_wmmt_news_extractor("WANGAN_MAXI_JP", constants.WANGAN_MAXI_VERSION.SIX_RR_PLUS, "wanganmaxi6rrplus/jp", "JPN")
diff --git a/generate.py b/generate.py
index a6ef392..fec2295 100644
--- a/generate.py
+++ b/generate.py
@@ -128,6 +128,9 @@ def generate_music_diver_news_file():
def generate_taiko_news_file():
return generate_news_file("taiko_news", constants.TAIKO_BLOG_SITE)
+def generate_wmmt_news_file():
+ return generate_news_file("wmmt_news", constants.WANGAN_MAXI_GENERIC)
+
def generate_wacca_plus_news_file():
return generate_news_file("wacca_plus_news", constants.WACCA_PLUS_MAGIC_STRING)
@@ -162,6 +165,7 @@ if __name__ == "__main__":
wacca_plus_news = generate_wacca_plus_news_file()
museca_plus_news = generate_museca_plus_news_file()
generate_rbdx_plus_news_file()
+ wmmt_news = generate_wmmt_news_file()
@@ -180,6 +184,7 @@ if __name__ == "__main__":
chunithm_intl_news_data,
music_diver_news_data,
taiko_news_data,
+ wmmt_news,
wacca_plus_news,
museca_plus_news,
polaris_news_data,
diff --git a/news_feed.py b/news_feed.py
index d621984..e6e125e 100644
--- a/news_feed.py
+++ b/news_feed.py
@@ -173,9 +173,28 @@ def get_news(news_url: str, version=None) -> list:
prelim_na_news_data = wmmt.get_wmmt_na_news_post_links(na_site_data)
for data in prelim_na_news_data:
post_site_data = download_site_as_html(data["url"])
- news_posts.append(wmmt.parse_wmmt_na_news(post_site_data, data))
- print(news_posts)
- exit()
+ news = wmmt.parse_wmmt_na_news(post_site_data, data)
+ if news is not None:
+ news_posts.append(news)
+ asia_oce_site_data = download_site_as_html(constants.WANGAN_MAXI_ASIA_OCE_NEWS_SITE)
+ prelim_asia_oce_news_data = wmmt.get_wmmt_asia_oce_news_post_links(asia_oce_site_data)
+ for data in prelim_asia_oce_news_data:
+ post_site_data = download_site_as_html(data["url"])
+ news = wmmt.parse_wmmt_asia_oce_news(post_site_data, data)
+ if news is not None:
+ news_posts.append(news)
+ jp_site_data = download_site_as_html(constants.WANGAN_MAXI_JP_NEWS_SITE, response_encoding="utf-8")
+ prelim_jp_news_data = wmmt.get_wmmt_jp_news_post_links(jp_site_data)
+ jp_news = []
+ for data in prelim_jp_news_data:
+ post_site_data = download_site_as_html(data["url"], response_encoding="utf-8")
+ news = wmmt.parse_wmmt_jp_news(post_site_data, data)
+ if news is not None:
+ jp_news.append(news)
+ jp_news = translate.add_translate_text_to_en(jp_news)
+ news_posts.extend(jp_news)
+ news_posts = sorted(news_posts, key=lambda x: x['timestamp'], reverse=True)
+ return news_posts
elif news_url == constants.WACCA_PLUS_MAGIC_STRING:
diff --git a/site_scraper.py b/site_scraper.py
index 9efa4b6..e9301b5 100644
--- a/site_scraper.py
+++ b/site_scraper.py
@@ -68,7 +68,7 @@ class SiteScraper:
print("WebDriver closed successfully")
-def download_site_as_html(url: str, timeout: int = 10) -> str:
+def download_site_as_html(url: str, timeout: int = 10, response_encoding=None) -> str:
headers = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
@@ -84,6 +84,8 @@ def download_site_as_html(url: str, timeout: int = 10) -> str:
try:
response = requests.get(url, headers=headers, timeout=timeout)
+ if response_encoding:
+ response.encoding = response_encoding
response.raise_for_status()
return response.text
except requests.RequestException as e:
Send patches to the email address below:
yukais@pinapelz.com
Include the subject prefix [PATCH repo_name].
pinapelz.com
homepage