diff options
| -rw-r--r-- | bandai_namco/wmmt.py | 142 | ||||
| -rw-r--r-- | generate.py | 5 | ||||
| -rw-r--r-- | news_feed.py | 25 | ||||
| -rw-r--r-- | site_scraper.py | 4 |
4 files changed, 168 insertions, 8 deletions
diff --git a/bandai_namco/wmmt.py b/bandai_namco/wmmt.py index 86a2ce4..eaa4767 100644 --- a/bandai_namco/wmmt.py +++ b/bandai_namco/wmmt.py @@ -15,7 +15,15 @@ TYPE_MAP = { "Online Events Information": "EVENTS", "Update Information": "UPDATE", "Future Lab News": "FUTURE LAB", - "Special Contents": "SPECIAL" + "Special Contents": "SPECIAL", + "Navi Scratch-off Item": "NAVI-SCRATCH", + "News": "NEWS", + "オンラインイベント情報": "EVENTS", + "アップデート情報": "UPDATE", + "未来研通信": "FUTURE LAB", + "スペシャルコンテンツ": "SPECIAL", + "ナビスクラッチ配信アイテム": "NAVI-SCRATCH", + "ニュース": "NEWS" } def make_wmmt_parser(version: constants.WANGAN_MAXI_VERSION): @@ -46,15 +54,72 @@ def make_wmmt_parser(version: constants.WANGAN_MAXI_VERSION): "url": url, "title": title, "date": date, - "type": TYPE_MAP[type_name] + "type": TYPE_MAP.get(type_name, "Unknown") }) count += 1 return results + + def six_rr_parser(html: str): + soup = BeautifulSoup(html, "html.parser") + results = [] + for section in soup.select("div.parts_column_02 > div.parts_bg_01"): + type_heading = section.select_one("section h2.parts_txt_01") + type_name = type_heading.get_text(strip=True) if type_heading else None + count = 0 + for a in section.select("ul.archiveNav a[href]"): + if count >= constants.WANGAN_MAXI_POSTS_PER_SECTION: + break + href = a["href"] + date_tag = a.find("p") + title_tag = a.find("h4") + title = title_tag.get_text(strip=True) if title_tag else "No title" + date = date_tag.get_text(strip=True) if date_tag else "No date" + url = urljoin(BASE_URL, href) + url = url.replace(".php", ".html") + results.append({ + "url": url, + "title": title, + "date": date, + "type": TYPE_MAP.get(type_name, "Unknown") + }) + count += 1 + return results + + def six_rr_plus_parser(html: str): + soup = BeautifulSoup(html, "html.parser") + results = [] + for section in soup.select("div.parts_column_02 > div.parts_bg_01"): + type_heading = section.select_one("section h2.parts_txt_01") + type_name = type_heading.get_text(strip=True) if type_heading else None + count = 0 + for a in section.select("ul.archiveNav a[href]"): + if count >= constants.WANGAN_MAXI_POSTS_PER_SECTION: + break + href = a["href"] + date_tag = a.find("p") + title_tag = a.find("h4") + title = title_tag.get_text(strip=True) if title_tag else "No title" + date = date_tag.get_text(strip=True) if date_tag else "No date" + url = urljoin(BASE_URL, href) + url = url.replace(".php", ".html") + results.append({ + "url": url, + "title": title, + "date": date, + "type": TYPE_MAP.get(type_name, "Unknown") + }) + count += 1 + return results + if version == constants.WANGAN_MAXI_VERSION.FIVE_DX_PLUS: return five_dx_plus_parser + elif version == constants.WANGAN_MAXI_VERSION.SIX_RR: + return six_rr_parser + elif version == constants.WANGAN_MAXI_VERSION.SIX_RR_PLUS: + return six_rr_plus_parser -def make_wmmt_news_extractor(identifier: str, version: constants.WANGAN_MAXI_VERSION, internal_path: str): +def make_wmmt_news_extractor(identifier: str, version: constants.WANGAN_MAXI_VERSION, internal_path: str, region_text: str): def five_dx_plus_extractor(html: str, data: dict): image_base = BASE_URL + "/" + internal_path soup = BeautifulSoup(html, "html.parser") @@ -82,6 +147,67 @@ def make_wmmt_news_extractor(identifier: str, version: constants.WANGAN_MAXI_VER "image": img_url, "link": urljoin(BASE_URL, parent.get("href")) if parent and parent.get("href") else None }) + data["type"] = "["+region_text+"]" + " " + data["type"] + data["identifier"] = identifier + data["timestamp"] = timestamp + data["content"] = content + data["images"] = images + data["is_ai_summary"] = False + return data + + def six_rr_extractor(html: str, data: dict): + image_base = BASE_URL + "/" + internal_path + soup = BeautifulSoup(html, "html.parser") + container = soup.select_one(".parts_column_02") + if not container: + return None + date_str = data["date"] + timestamp = int(datetime.strptime(date_str, "%Y/%m/%d").replace(tzinfo=timezone.utc).timestamp()) + first_p = container.find("p") + content = first_p.get_text(" ", strip=True) if first_p else "" + images = [] + for img in container.select("img"): + src = img.get("src") + if not src: + continue + src = src.replace("./", "").lstrip("/") + img_url = f"{image_base}/{src}" + parent = img.find_parent("a") + images.append({ + "image": img_url, + "link": urljoin(BASE_URL, parent.get("href")) if parent and parent.get("href") else None + }) + data["type"] = "["+region_text+"]" + " " + data["type"] + data["identifier"] = identifier + data["timestamp"] = timestamp + data["content"] = content + data["images"] = images + data["is_ai_summary"] = False + return data + + def six_rr_plus_extractor(html: str, data: dict): + image_base = BASE_URL + "/" + internal_path + soup = BeautifulSoup(html, "html.parser") + container = soup.select_one(".parts_column_02") + if not container: + return None + date_str = data["date"] + timestamp = int(datetime.strptime(date_str, "%Y/%m/%d").replace(tzinfo=timezone.utc).timestamp()) + first_p = container.find("p") + content = first_p.get_text(" ", strip=True) if first_p else "" + images = [] + for img in container.select("img"): + src = img.get("src") + if not src: + continue + src = src.replace("./", "").lstrip("/") + img_url = f"{image_base}/{src}" + parent = img.find_parent("a") + images.append({ + "image": img_url, + "link": urljoin(BASE_URL, parent.get("href")) if parent and parent.get("href") else None + }) + data["type"] = "["+region_text+"]" + " " + data["type"] data["identifier"] = identifier data["timestamp"] = timestamp data["content"] = content @@ -91,6 +217,14 @@ def make_wmmt_news_extractor(identifier: str, version: constants.WANGAN_MAXI_VER if version == constants.WANGAN_MAXI_VERSION.FIVE_DX_PLUS: return five_dx_plus_extractor + elif version == constants.WANGAN_MAXI_VERSION.SIX_RR: + return six_rr_extractor + elif version == constants.WANGAN_MAXI_VERSION.SIX_RR_PLUS: + return six_rr_plus_extractor get_wmmt_na_news_post_links = make_wmmt_parser(constants.WANGAN_MAXI_VERSION.FIVE_DX_PLUS) -parse_wmmt_na_news = make_wmmt_news_extractor("WANGAN_MAXI_NA", constants.WANGAN_MAXI_VERSION.FIVE_DX_PLUS, "wanganmaxi5dxplus/na") +get_wmmt_asia_oce_news_post_links = make_wmmt_parser(constants.WANGAN_MAXI_VERSION.SIX_RR) +get_wmmt_jp_news_post_links = make_wmmt_parser(constants.WANGAN_MAXI_VERSION.SIX_RR_PLUS) +parse_wmmt_na_news = make_wmmt_news_extractor("WANGAN_MAXI_NA", constants.WANGAN_MAXI_VERSION.FIVE_DX_PLUS, "wanganmaxi5dxplus/na", "NA") +parse_wmmt_asia_oce_news = make_wmmt_news_extractor("WANGAN_MAXI_ASIA_OCE", constants.WANGAN_MAXI_VERSION.SIX_RR, "wanganmaxi6rr/en", "ASIA/OCE") +parse_wmmt_jp_news = make_wmmt_news_extractor("WANGAN_MAXI_JP", constants.WANGAN_MAXI_VERSION.SIX_RR_PLUS, "wanganmaxi6rrplus/jp", "JPN") diff --git a/generate.py b/generate.py index a6ef392..fec2295 100644 --- a/generate.py +++ b/generate.py @@ -128,6 +128,9 @@ def generate_music_diver_news_file(): def generate_taiko_news_file(): return generate_news_file("taiko_news", constants.TAIKO_BLOG_SITE) +def generate_wmmt_news_file(): + return generate_news_file("wmmt_news", constants.WANGAN_MAXI_GENERIC) + def generate_wacca_plus_news_file(): return generate_news_file("wacca_plus_news", constants.WACCA_PLUS_MAGIC_STRING) @@ -162,6 +165,7 @@ if __name__ == "__main__": wacca_plus_news = generate_wacca_plus_news_file() museca_plus_news = generate_museca_plus_news_file() generate_rbdx_plus_news_file() + wmmt_news = generate_wmmt_news_file() @@ -180,6 +184,7 @@ if __name__ == "__main__": chunithm_intl_news_data, music_diver_news_data, taiko_news_data, + wmmt_news, wacca_plus_news, museca_plus_news, polaris_news_data, diff --git a/news_feed.py b/news_feed.py index d621984..e6e125e 100644 --- a/news_feed.py +++ b/news_feed.py @@ -173,9 +173,28 @@ def get_news(news_url: str, version=None) -> list: prelim_na_news_data = wmmt.get_wmmt_na_news_post_links(na_site_data) for data in prelim_na_news_data: post_site_data = download_site_as_html(data["url"]) - news_posts.append(wmmt.parse_wmmt_na_news(post_site_data, data)) - print(news_posts) - exit() + news = wmmt.parse_wmmt_na_news(post_site_data, data) + if news is not None: + news_posts.append(news) + asia_oce_site_data = download_site_as_html(constants.WANGAN_MAXI_ASIA_OCE_NEWS_SITE) + prelim_asia_oce_news_data = wmmt.get_wmmt_asia_oce_news_post_links(asia_oce_site_data) + for data in prelim_asia_oce_news_data: + post_site_data = download_site_as_html(data["url"]) + news = wmmt.parse_wmmt_asia_oce_news(post_site_data, data) + if news is not None: + news_posts.append(news) + jp_site_data = download_site_as_html(constants.WANGAN_MAXI_JP_NEWS_SITE, response_encoding="utf-8") + prelim_jp_news_data = wmmt.get_wmmt_jp_news_post_links(jp_site_data) + jp_news = [] + for data in prelim_jp_news_data: + post_site_data = download_site_as_html(data["url"], response_encoding="utf-8") + news = wmmt.parse_wmmt_jp_news(post_site_data, data) + if news is not None: + jp_news.append(news) + jp_news = translate.add_translate_text_to_en(jp_news) + news_posts.extend(jp_news) + news_posts = sorted(news_posts, key=lambda x: x['timestamp'], reverse=True) + return news_posts elif news_url == constants.WACCA_PLUS_MAGIC_STRING: diff --git a/site_scraper.py b/site_scraper.py index 9efa4b6..e9301b5 100644 --- a/site_scraper.py +++ b/site_scraper.py @@ -68,7 +68,7 @@ class SiteScraper: print("WebDriver closed successfully") -def download_site_as_html(url: str, timeout: int = 10) -> str: +def download_site_as_html(url: str, timeout: int = 10, response_encoding=None) -> str: headers = { "User-Agent": ( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " @@ -84,6 +84,8 @@ def download_site_as_html(url: str, timeout: int = 10) -> str: try: response = requests.get(url, headers=headers, timeout=timeout) + if response_encoding: + response.encoding = response_encoding response.raise_for_status() return response.text except requests.RequestException as e: |
