From caa3cf245186ab0f6fb33e63a7dd838d834da12e Mon Sep 17 00:00:00 2001 From: Pinapelz Date: Thu, 12 Mar 2026 13:56:30 -0700 Subject: refactor: move to common NewsSource interface cleanup imports by defining initaliazers modules and decorator remove legacy scrapers remove single factory for sega games (sites don't change that much) --- sega/__init__.py | 23 +++++++ sega/chuni_intl.py | 166 ++++++++++++++++++++------------------------- sega/chuni_jp.py | 183 ++++++++++++++++++++++---------------------------- sega/maimaidx_intl.py | 49 -------------- sega/maimaidx_jp.py | 107 ++++++++++++++--------------- sega/ongeki_jp.py | 120 +++++++++++++++------------------ 6 files changed, 282 insertions(+), 366 deletions(-) create mode 100644 sega/__init__.py (limited to 'sega') diff --git a/sega/__init__.py b/sega/__init__.py new file mode 100644 index 0000000..242ab52 --- /dev/null +++ b/sega/__init__.py @@ -0,0 +1,23 @@ +from sega.chuni_jp import parse_chuni_jp_news_site, parse_chuni_jp_post_images +from sega.chuni_intl import ( + parse_chuni_intl_api_route, + parse_chuni_intl_news_site, + parse_chuni_intl_post_images, +) +from sega.maimaidx_jp import parse_maimaidx_jp_news_site +from sega.maimaidx_intl import parse_maimaidx_intl_api_route +from sega.ongeki_jp import parse_ongeki_news_site +from sega.idac import parse_idac_news_site, get_promo_image + +__all__ = [ + "parse_chuni_jp_news_site", + "parse_chuni_jp_post_images", + "parse_chuni_intl_api_route", + "parse_chuni_intl_news_site", + "parse_chuni_intl_post_images", + "parse_maimaidx_jp_news_site", + "parse_maimaidx_intl_api_route", + "parse_ongeki_news_site", + "parse_idac_news_site", + "get_promo_image", +] \ No newline at end of file diff --git a/sega/chuni_intl.py b/sega/chuni_intl.py index 64d279c..816b857 100644 --- a/sega/chuni_intl.py +++ b/sega/chuni_intl.py @@ -1,100 +1,11 @@ import re from datetime import datetime, timedelta, timezone -from enum import Enum import json from urllib.parse import urljoin from bs4 import BeautifulSoup -class ParserVersion(Enum): - ALPHA = 1 - - -def make_chuni_intl_parser(identifier: str, parser: ParserVersion): - def alpha_parser(html: str): - """ - Confirmed on: - LUMINOUS PLUS - """ - soup = BeautifulSoup(html, "html.parser") - base_url = "https://info-chunithm.sega.com/" - items = soup.select("li.news--list__item") - results = [] - - for item in items: - a_tag = item.select_one("a.news--list__post") - if not a_tag: - continue - - url = urljoin(base_url, a_tag["href"]) - date_text = item.select_one("div.news--date").text.strip() - headline = item.select_one("p.news--title").text.strip() - img_tag = item.select_one("div.news--thumbnail img") - image_url = urljoin(base_url, img_tag["src"]) if img_tag else None - - date_match = re.match(r"(\d{4})\.(\d{1,2})\.(\d{1,2})", date_text) - if not date_match: - continue - year, month, day = map(int, date_match.groups()) - jst = timezone(timedelta(hours=9)) - dt = datetime(year, month, day, tzinfo=jst) - timestamp = int(dt.timestamp()) - - results.append( - { - "date": dt.strftime("%Y-%m-%d"), - "identifier": identifier, - "type": None, - "timestamp": timestamp, - "headline": None, - "content": headline, - "url": url, - "images": [{"image": image_url, "link": url}] if image_url else [], - 'is_ai_summary': False - } - ) - - return results - - if parser == ParserVersion.ALPHA: - return alpha_parser - - -def make_image_extractor(version: ParserVersion): - """ - Gets all the images from a full post page as CHUNITHM intl has more relevant images - hidden in the actual posts - """ - - def image_extractor_alpha(html: str): - base_url = "https://info-chunithm.sega.com/" - soup = BeautifulSoup(html, "html.parser") - images = [] - news_post = soup.select_one(".news--post") - if not news_post: - return images - - for img in news_post.find_all("img"): - src = img.get("src") or img.get("data-src") - if not src: - continue - - full_url = urljoin(base_url, src) - parent = img.find_parent("a") - link = parent.get("href") if parent and parent.name == "a" else None - - images.append( - {"image": full_url, "link": urljoin(base_url, link) if link else None} - ) - - return images - - if version == ParserVersion.ALPHA: - return image_extractor_alpha - else: - raise ValueError("Unknown Parser Version") - def parse_chuni_intl_api_route(raw_api_data: str, identifier: str, limit: int): route_data = json.loads(raw_api_data) route_data = route_data[:limit] @@ -126,7 +37,76 @@ def parse_chuni_intl_api_route(raw_api_data: str, identifier: str, limit: int): return entries -parse_chuni_intl_news_site = make_chuni_intl_parser( - "CHUNITHM_INTL", ParserVersion.ALPHA -) -parse_chuni_intl_post_images = make_image_extractor(ParserVersion.ALPHA) +def parse_chuni_intl_post_images(html: str): + """ + Gets all the images from a full post page as CHUNITHM intl has more relevant images + hidden in the actual posts. + """ + base_url = "https://info-chunithm.sega.com/" + soup = BeautifulSoup(html, "html.parser") + images = [] + news_post = soup.select_one(".news--post") + if not news_post: + return images + + for img in news_post.find_all("img"): + src = img.get("src") or img.get("data-src") + if not src: + continue + + full_url = urljoin(base_url, src) + parent = img.find_parent("a") + link = parent.get("href") if parent and parent.name == "a" else None + + images.append( + {"image": full_url, "link": urljoin(base_url, link) if link else None} + ) + + return images + + +def parse_chuni_intl_news_site(html: str): + """ + Confirmed on: + LUMINOUS PLUS + """ + identifier = "CHUNITHM_INTL" + soup = BeautifulSoup(html, "html.parser") + base_url = "https://info-chunithm.sega.com/" + items = soup.select("li.news--list__item") + results = [] + + for item in items: + a_tag = item.select_one("a.news--list__post") + if not a_tag: + continue + + url = urljoin(base_url, a_tag["href"]) + date_text = item.select_one("div.news--date").text.strip() + headline = item.select_one("p.news--title").text.strip() + img_tag = item.select_one("div.news--thumbnail img") + image_url = urljoin(base_url, img_tag["src"]) if img_tag else None + + date_match = re.match(r"(\d{4})\.(\d{1,2})\.(\d{1,2})", date_text) + if not date_match: + continue + year, month, day = map(int, date_match.groups()) + jst = timezone(timedelta(hours=9)) + dt = datetime(year, month, day, tzinfo=jst) + timestamp = int(dt.timestamp()) + + results.append( + { + "date": dt.strftime("%Y-%m-%d"), + "identifier": identifier, + "type": None, + "timestamp": timestamp, + "headline": None, + "content": headline, + "url": url, + "images": [{"image": image_url, "link": url}] if image_url else [], + "is_ai_summary": False, + } + ) + + return results \ No newline at end of file diff --git a/sega/chuni_jp.py b/sega/chuni_jp.py index 452e153..a914270 100644 --- a/sega/chuni_jp.py +++ b/sega/chuni_jp.py @@ -1,114 +1,93 @@ import re from datetime import datetime, timedelta, timezone -from enum import Enum from urllib.parse import urljoin from bs4 import BeautifulSoup -class ParserVersion(Enum): - ALPHA = 1 - - -def make_chuni_jp_parser(identifier: str, parser: ParserVersion): - def alpha_parser(html: str): - """ - Confirmed on: - VERSE - """ - soup = BeautifulSoup(html, "html.parser") - news_entries = [] - news_wrapper = soup.find("div", class_="newsMainWrapper-left") - if not news_wrapper: - return news_entries - for a_tag in news_wrapper.find_all("a", href=True): - if not a_tag.find("div", class_="chuniCommonBox-inner"): - continue - news_dict = {} - news_url = a_tag.get("href") - news_dict["url"] = news_url - - date_container = a_tag.find("div", class_="chuniCommonBox-inner-title") - date_str = None - if date_container: - title_span = date_container.find("span", class_="title") - if title_span: - text = title_span.get_text(strip=True) - date_match = re.search(r"(\d{4}\.\d{2}\.\d{2})", text) - if date_match: - date_str = date_match.group(1) - news_dict["date"] = date_str - news_dict["type"] = None - timestamp = None - if date_str: - try: - dt = datetime.strptime(date_str, "%Y.%m.%d") - dt = dt.replace(tzinfo=timezone(timedelta(hours=9))) - timestamp = int(dt.timestamp()) - except Exception: - timestamp = None - news_dict["timestamp"] = timestamp - - main_content = a_tag.find("div", class_="chuniCommonBox-inner-main") - content_text = "" - if main_content: - content_text = main_content.get_text(separator=" ", strip=True) - news_dict["content"] = content_text - - images = {"image": None, "link": None} - if main_content: - img_tag = main_content.find("img") - if img_tag: - images["image"] = img_tag.get("src") - images["link"] = news_url - news_dict["images"] = [images] - news_dict["identifier"] = identifier - news_dict["is_ai_summary"] = False - - news_entries.append(news_dict) - +def parse_chuni_jp_news_site(html: str): + """ + Confirmed on: + VERSE + """ + identifier = "CHUNITHM_JP" + soup = BeautifulSoup(html, "html.parser") + news_entries = [] + news_wrapper = soup.find("div", class_="newsMainWrapper-left") + if not news_wrapper: return news_entries - - if parser == ParserVersion.ALPHA: - return alpha_parser - - -def make_image_extractor(version: ParserVersion): + for a_tag in news_wrapper.find_all("a", href=True): + if not a_tag.find("div", class_="chuniCommonBox-inner"): + continue + news_dict = {} + news_url = a_tag.get("href") + news_dict["url"] = news_url + + date_container = a_tag.find("div", class_="chuniCommonBox-inner-title") + date_str = None + if date_container: + title_span = date_container.find("span", class_="title") + if title_span: + text = title_span.get_text(strip=True) + date_match = re.search(r"(\d{4}\.\d{2}\.\d{2})", text) + if date_match: + date_str = date_match.group(1) + news_dict["date"] = date_str + news_dict["type"] = None + timestamp = None + if date_str: + try: + dt = datetime.strptime(date_str, "%Y.%m.%d") + dt = dt.replace(tzinfo=timezone(timedelta(hours=9))) + timestamp = int(dt.timestamp()) + except Exception: + timestamp = None + news_dict["timestamp"] = timestamp + + main_content = a_tag.find("div", class_="chuniCommonBox-inner-main") + content_text = "" + if main_content: + content_text = main_content.get_text(separator=" ", strip=True) + news_dict["content"] = content_text + + images = {"image": None, "link": None} + if main_content: + img_tag = main_content.find("img") + if img_tag: + images["image"] = img_tag.get("src") + images["link"] = news_url + news_dict["images"] = [images] + news_dict["identifier"] = identifier + news_dict["is_ai_summary"] = False + + news_entries.append(news_dict) + + return news_entries + + +def parse_chuni_jp_post_images(html: str): """ - Gets all the images from a full post page as CHUNITHM intl has more relevant images - hidden in the actual posts + Gets all the images from a full post page as CHUNITHM JP has more relevant images + hidden in the actual posts. """ + base_url = "https://info-chunithm.sega.jp/" + soup = BeautifulSoup(html, "html.parser") + images = [] - def image_extractor_alpha(html: str): - base_url = "https://info-chunithm.sega.jp/" - soup = BeautifulSoup(html, "html.parser") - images = [] - - container = soup.select_one(".chuniCommonBox-inner-main") - if not container: - return images - for img in container.find_all("img"): - if img.find_parent("p") and "©" in img.find_parent("p").text: - continue - - src = img.get("src") or img.get("data-src") - if not src: - continue - full_url = urljoin(base_url, src) - parent = img.find_parent("a") - link = parent.get("href") if parent and parent.name == "a" else None - images.append( - {"image": full_url, "link": urljoin(base_url, link) if link else None} - ) + container = soup.select_one(".chuniCommonBox-inner-main") + if not container: return images - - if version == ParserVersion.ALPHA: - return image_extractor_alpha - else: - raise ValueError("Unknown Parser Version") - - -parse_chuni_jp_news_site = make_chuni_jp_parser( - "CHUNITHM_JP", ParserVersion.ALPHA -) -parse_chuni_jp_post_images = make_image_extractor(ParserVersion.ALPHA) + for img in container.find_all("img"): + if img.find_parent("p") and "©" in img.find_parent("p").text: + continue + + src = img.get("src") or img.get("data-src") + if not src: + continue + full_url = urljoin(base_url, src) + parent = img.find_parent("a") + link = parent.get("href") if parent and parent.name == "a" else None + images.append( + {"image": full_url, "link": urljoin(base_url, link) if link else None} + ) + return images diff --git a/sega/maimaidx_intl.py b/sega/maimaidx_intl.py index 3e26a37..8182117 100644 --- a/sega/maimaidx_intl.py +++ b/sega/maimaidx_intl.py @@ -1,53 +1,7 @@ from bs4 import BeautifulSoup from datetime import datetime, timezone, timedelta -from enum import Enum import json -class ParserVersion(Enum): - ALPHA=1 - -def make_maimaidx_intl_parser(identifier: str, parser: ParserVersion): - """ - Parses the download page of maimai dx intl site. API route method below is preferred as information is the same - """ - def alpha_parser(html: str): - """ - Confirmed on: - PRISM - """ - soup = BeautifulSoup(html, "html.parser") - items = soup.select(".dl--pop__item") - - entries = [] - for item in items: - date_text = item.select_one(".dl--pop__head").text.strip().replace(" UP", "") - dt = datetime.strptime(date_text, "%Y.%m.%d").replace(tzinfo=timezone(timedelta(hours=9))) - timestamp = int(dt.timestamp()) - - img_tag = item.select_one("a.dl--pop__thumb img") - image_url = img_tag["srcset"] if img_tag else None - full_image_url = image_url.replace("../", "https://maimai.sega.com/") if image_url else None - - entry = { - "date": date_text, - "identifier": identifier, - "type": None, - "timestamp": timestamp, - "headline": None, - "content": f"New maimai DX International News / maimai DX International の新しいお知らせ\n\n{full_image_url}", - "url": None, - "images": [ - { - "image": full_image_url, - "link": None - } - ], - 'is_ai_summary': False - } - entries.append(entry) - return entries - if parser == ParserVersion.ALPHA: - return alpha_parser def parse_maimaidx_intl_api_route(raw_api_data: str, identifier: str, limit: int): route_data = json.loads(raw_api_data) @@ -84,6 +38,3 @@ def parse_maimaidx_intl_api_route(raw_api_data: str, identifier: str, limit: int } entries.append(entry) return entries - - -parse_maimaidx_intl_news_site = make_maimaidx_intl_parser("MAIMAIDX_INTL", ParserVersion.ALPHA) diff --git a/sega/maimaidx_jp.py b/sega/maimaidx_jp.py index 1314325..2b61c9a 100644 --- a/sega/maimaidx_jp.py +++ b/sega/maimaidx_jp.py @@ -1,60 +1,53 @@ from bs4 import BeautifulSoup from datetime import datetime, timezone, timedelta from urllib.parse import urljoin -from enum import Enum - -class ParserVersion(Enum): - ALPHA=1 - -def make_maimaidx_jpn_parser(identifier: str, parser: ParserVersion): - def alpha_parser(html: str): - """ - Confirmed on: - PRISM PLUS - """ - soup = BeautifulSoup(html, "html.parser") - base_url = "https://info-maimai.sega.jp/" - news_items = [] - - news_boxes = soup.select(".maiPager-content .newsBox") - for box in news_boxes: - a_tag = box.select_one("a") - url = urljoin(base_url, a_tag["href"]) if a_tag and a_tag.get("href") else None - - img_tag = box.select_one("img") - image_url = urljoin(base_url, img_tag["src"]) if img_tag else None - - date_tag = box.select_one(".newsDate") - raw_date = date_tag.get_text(strip=True) if date_tag else None - - jst = timezone(timedelta(hours=9)) - try: - dt = datetime.strptime(raw_date.split(" ")[0], "%Y.%m.%d").replace(tzinfo=jst) - timestamp = int(dt.timestamp()) - except: - dt = None - timestamp = 0 - - content_tag = box.select_one(".newsLink") - content = content_tag.get_text(strip=True) if content_tag else None - - news_items.append({ - "date": raw_date, - "identifier": identifier, - "type": None, - "timestamp": timestamp, - "headline": None, - "content": content, - "url": url, - 'is_ai_summary': False, - "images": [{ - "image": image_url, - "link": url - }] if image_url else [] - }) - - return news_items - if parser == ParserVersion.ALPHA: - return alpha_parser - -parse_maimaidx_jp_news_site = make_maimaidx_jpn_parser("MAIMAIDX_JP", ParserVersion.ALPHA) + + +def parse_maimaidx_jp_news_site(html: str): + """ + Confirmed on: + PRISM PLUS + """ + identifier = "MAIMAIDX_JP" + soup = BeautifulSoup(html, "html.parser") + base_url = "https://info-maimai.sega.jp/" + news_items = [] + + news_boxes = soup.select(".maiPager-content .newsBox") + for box in news_boxes: + a_tag = box.select_one("a") + url = urljoin(base_url, a_tag["href"]) if a_tag and a_tag.get("href") else None + + img_tag = box.select_one("img") + image_url = urljoin(base_url, img_tag["src"]) if img_tag else None + + date_tag = box.select_one(".newsDate") + raw_date = date_tag.get_text(strip=True) if date_tag else None + + jst = timezone(timedelta(hours=9)) + try: + dt = datetime.strptime(raw_date.split(" ")[0], "%Y.%m.%d").replace(tzinfo=jst) + timestamp = int(dt.timestamp()) + except Exception: + dt = None + timestamp = 0 + + content_tag = box.select_one(".newsLink") + content = content_tag.get_text(strip=True) if content_tag else None + + news_items.append({ + "date": raw_date, + "identifier": identifier, + "type": None, + "timestamp": timestamp, + "headline": None, + "content": content, + "url": url, + "is_ai_summary": False, + "images": [{ + "image": image_url, + "link": url + }] if image_url else [] + }) + + return news_items \ No newline at end of file diff --git a/sega/ongeki_jp.py b/sega/ongeki_jp.py index f9c2dc4..c173189 100644 --- a/sega/ongeki_jp.py +++ b/sega/ongeki_jp.py @@ -1,68 +1,58 @@ -import time -from datetime import datetime -from enum import Enum +from datetime import datetime, timezone, timedelta from bs4 import BeautifulSoup - -class ParserVersion(Enum): - ALPHA = 1 - - -def make_ongeki_parser(identifier: str, parser: ParserVersion): - def alpha_parser(html: str): - soup = BeautifulSoup(html, "html.parser") - items = [] - - for li in soup.select("li.p-news__listChild"): - a_tag = li.select_one("a.p-news__listLink") - url = a_tag["href"] if a_tag else None - - img_tag = li.select_one(".p-news__listThumb img") - image_url = img_tag["src"] if img_tag else None - image_alt = img_tag["alt"] if img_tag else "" - image_link = url if image_url else None - - date_type_text = li.select_one(".p-news__listTextUpper") - date_text = ( - date_type_text.text.strip().split("/")[0].strip() - if date_type_text - else None - ) - type_text = ( - date_type_text.text.strip().split("/")[-1].strip() - if "/" in date_type_text.text - else None - ) - - timestamp = None - if date_text: - try: - dt = datetime.strptime(date_text, "%Y.%m.%d %a") - timestamp = int(time.mktime(dt.timetuple())) - except: - timestamp = None - - entry = { - "date": date_text, - "identifier": identifier, - "type": type_text if type_text not in ["GAME", "CARDMAKER"] else None, - "timestamp": timestamp, - "headline": None, - "content": image_alt, - "url": url, - "is_ai_summary": False, - "images": [{"image": image_url, "link": image_link}] - if image_url - else [], - } - - items.append(entry) - - return items - - if parser == ParserVersion.ALPHA: - return alpha_parser - - -parse_ongeki_news_site = make_ongeki_parser("ONGEKI_JPN", ParserVersion.ALPHA) +JST = timezone(timedelta(hours=9)) + + +def parse_ongeki_news_site(html: str): + identifier = "ONGEKI_JPN" + soup = BeautifulSoup(html, "html.parser") + items = [] + + for li in soup.select("li.p-news__listChild"): + a_tag = li.select_one("a.p-news__listLink") + url = a_tag["href"] if a_tag else None + + img_tag = li.select_one(".p-news__listThumb img") + image_url = img_tag["src"] if img_tag else None + image_alt = img_tag["alt"] if img_tag else "" + image_link = url if image_url else None + + date_type_text = li.select_one(".p-news__listTextUpper") + date_text = ( + date_type_text.text.strip().split("/")[0].strip() + if date_type_text + else None + ) + type_text = ( + date_type_text.text.strip().split("/")[-1].strip() + if date_type_text and "/" in date_type_text.text + else None + ) + + timestamp = None + if date_text: + try: + dt = datetime.strptime(date_text, "%Y.%m.%d %a").replace(tzinfo=JST) + timestamp = int(dt.timestamp()) + except Exception: + timestamp = None + + entry = { + "date": date_text, + "identifier": identifier, + "type": type_text if type_text not in ["GAME", "CARDMAKER"] else None, + "timestamp": timestamp, + "headline": None, + "content": image_alt, + "url": url, + "is_ai_summary": False, + "images": [{"image": image_url, "link": image_link}] + if image_url + else [], + } + + items.append(entry) + + return items \ No newline at end of file -- cgit v1.2.3