diff options
| author | Pinapelz <yukais@pinapelz.com> | 2026-03-12 13:56:30 -0700 |
|---|---|---|
| committer | Pinapelz <yukais@pinapelz.com> | 2026-03-12 13:56:50 -0700 |
| commit | caa3cf245186ab0f6fb33e63a7dd838d834da12e (patch) | |
| tree | bc5742a134ecabf0b9d35cc12b1d6f67defd5da7 /sega | |
| parent | 5658441ab9b703c95a48e654d41e45cc3a55ffd3 (diff) | |
refactor: move to common NewsSource interface
clean up imports by defining initializer modules and a decorator
remove legacy scrapers
remove single factory for sega games (sites don't change that much)
Diffstat (limited to 'sega')
| -rw-r--r-- | sega/__init__.py | 23 | ||||
| -rw-r--r-- | sega/chuni_intl.py | 166 | ||||
| -rw-r--r-- | sega/chuni_jp.py | 169 | ||||
| -rw-r--r-- | sega/maimaidx_intl.py | 49 | ||||
| -rw-r--r-- | sega/maimaidx_jp.py | 89 | ||||
| -rw-r--r-- | sega/ongeki_jp.py | 102 |
6 files changed, 257 insertions, 341 deletions
diff --git a/sega/__init__.py b/sega/__init__.py new file mode 100644 index 0000000..242ab52 --- /dev/null +++ b/sega/__init__.py @@ -0,0 +1,23 @@ +from sega.chuni_jp import parse_chuni_jp_news_site, parse_chuni_jp_post_images +from sega.chuni_intl import ( + parse_chuni_intl_api_route, + parse_chuni_intl_news_site, + parse_chuni_intl_post_images, +) +from sega.maimaidx_jp import parse_maimaidx_jp_news_site +from sega.maimaidx_intl import parse_maimaidx_intl_api_route +from sega.ongeki_jp import parse_ongeki_news_site +from sega.idac import parse_idac_news_site, get_promo_image + +__all__ = [ + "parse_chuni_jp_news_site", + "parse_chuni_jp_post_images", + "parse_chuni_intl_api_route", + "parse_chuni_intl_news_site", + "parse_chuni_intl_post_images", + "parse_maimaidx_jp_news_site", + "parse_maimaidx_intl_api_route", + "parse_ongeki_news_site", + "parse_idac_news_site", + "get_promo_image", +]
\ No newline at end of file diff --git a/sega/chuni_intl.py b/sega/chuni_intl.py index 64d279c..816b857 100644 --- a/sega/chuni_intl.py +++ b/sega/chuni_intl.py @@ -1,100 +1,11 @@ import re from datetime import datetime, timedelta, timezone -from enum import Enum import json from urllib.parse import urljoin from bs4 import BeautifulSoup -class ParserVersion(Enum): - ALPHA = 1 - - -def make_chuni_intl_parser(identifier: str, parser: ParserVersion): - def alpha_parser(html: str): - """ - Confirmed on: - LUMINOUS PLUS - """ - soup = BeautifulSoup(html, "html.parser") - base_url = "https://info-chunithm.sega.com/" - items = soup.select("li.news--list__item") - results = [] - - for item in items: - a_tag = item.select_one("a.news--list__post") - if not a_tag: - continue - - url = urljoin(base_url, a_tag["href"]) - date_text = item.select_one("div.news--date").text.strip() - headline = item.select_one("p.news--title").text.strip() - img_tag = item.select_one("div.news--thumbnail img") - image_url = urljoin(base_url, img_tag["src"]) if img_tag else None - - date_match = re.match(r"(\d{4})\.(\d{1,2})\.(\d{1,2})", date_text) - if not date_match: - continue - year, month, day = map(int, date_match.groups()) - jst = timezone(timedelta(hours=9)) - dt = datetime(year, month, day, tzinfo=jst) - timestamp = int(dt.timestamp()) - - results.append( - { - "date": dt.strftime("%Y-%m-%d"), - "identifier": identifier, - "type": None, - "timestamp": timestamp, - "headline": None, - "content": headline, - "url": url, - "images": [{"image": image_url, "link": url}] if image_url else [], - 'is_ai_summary': False - } - ) - - return results - - if parser == ParserVersion.ALPHA: - return alpha_parser - - -def make_image_extractor(version: ParserVersion): - """ - Gets all the images from a full post page as CHUNITHM intl has more relevant images - hidden in the actual posts - """ - - def image_extractor_alpha(html: str): - base_url = "https://info-chunithm.sega.com/" - soup = 
BeautifulSoup(html, "html.parser") - images = [] - news_post = soup.select_one(".news--post") - if not news_post: - return images - - for img in news_post.find_all("img"): - src = img.get("src") or img.get("data-src") - if not src: - continue - - full_url = urljoin(base_url, src) - parent = img.find_parent("a") - link = parent.get("href") if parent and parent.name == "a" else None - - images.append( - {"image": full_url, "link": urljoin(base_url, link) if link else None} - ) - - return images - - if version == ParserVersion.ALPHA: - return image_extractor_alpha - else: - raise ValueError("Unknown Parser Version") - def parse_chuni_intl_api_route(raw_api_data: str, identifier: str, limit: int): route_data = json.loads(raw_api_data) route_data = route_data[:limit] @@ -126,7 +37,76 @@ def parse_chuni_intl_api_route(raw_api_data: str, identifier: str, limit: int): return entries -parse_chuni_intl_news_site = make_chuni_intl_parser( - "CHUNITHM_INTL", ParserVersion.ALPHA -) -parse_chuni_intl_post_images = make_image_extractor(ParserVersion.ALPHA) +def parse_chuni_intl_post_images(html: str): + """ + Gets all the images from a full post page as CHUNITHM intl has more relevant images + hidden in the actual posts. 
+ """ + base_url = "https://info-chunithm.sega.com/" + soup = BeautifulSoup(html, "html.parser") + images = [] + news_post = soup.select_one(".news--post") + if not news_post: + return images + + for img in news_post.find_all("img"): + src = img.get("src") or img.get("data-src") + if not src: + continue + + full_url = urljoin(base_url, src) + parent = img.find_parent("a") + link = parent.get("href") if parent and parent.name == "a" else None + + images.append( + {"image": full_url, "link": urljoin(base_url, link) if link else None} + ) + + return images + + +def parse_chuni_intl_news_site(html: str): + """ + Confirmed on: + LUMINOUS PLUS + """ + identifier = "CHUNITHM_INTL" + soup = BeautifulSoup(html, "html.parser") + base_url = "https://info-chunithm.sega.com/" + items = soup.select("li.news--list__item") + results = [] + + for item in items: + a_tag = item.select_one("a.news--list__post") + if not a_tag: + continue + + url = urljoin(base_url, a_tag["href"]) + date_text = item.select_one("div.news--date").text.strip() + headline = item.select_one("p.news--title").text.strip() + img_tag = item.select_one("div.news--thumbnail img") + image_url = urljoin(base_url, img_tag["src"]) if img_tag else None + + date_match = re.match(r"(\d{4})\.(\d{1,2})\.(\d{1,2})", date_text) + if not date_match: + continue + year, month, day = map(int, date_match.groups()) + jst = timezone(timedelta(hours=9)) + dt = datetime(year, month, day, tzinfo=jst) + timestamp = int(dt.timestamp()) + + results.append( + { + "date": dt.strftime("%Y-%m-%d"), + "identifier": identifier, + "type": None, + "timestamp": timestamp, + "headline": None, + "content": headline, + "url": url, + "images": [{"image": image_url, "link": url}] if image_url else [], + "is_ai_summary": False, + } + ) + + return results
\ No newline at end of file diff --git a/sega/chuni_jp.py b/sega/chuni_jp.py index 452e153..a914270 100644 --- a/sega/chuni_jp.py +++ b/sega/chuni_jp.py @@ -1,114 +1,93 @@ import re from datetime import datetime, timedelta, timezone -from enum import Enum from urllib.parse import urljoin from bs4 import BeautifulSoup -class ParserVersion(Enum): - ALPHA = 1 - - -def make_chuni_jp_parser(identifier: str, parser: ParserVersion): - def alpha_parser(html: str): - """ - Confirmed on: - VERSE - """ - soup = BeautifulSoup(html, "html.parser") - news_entries = [] - news_wrapper = soup.find("div", class_="newsMainWrapper-left") - if not news_wrapper: - return news_entries - for a_tag in news_wrapper.find_all("a", href=True): - if not a_tag.find("div", class_="chuniCommonBox-inner"): - continue - news_dict = {} - news_url = a_tag.get("href") - news_dict["url"] = news_url +def parse_chuni_jp_news_site(html: str): + """ + Confirmed on: + VERSE + """ + identifier = "CHUNITHM_JP" + soup = BeautifulSoup(html, "html.parser") + news_entries = [] + news_wrapper = soup.find("div", class_="newsMainWrapper-left") + if not news_wrapper: + return news_entries + for a_tag in news_wrapper.find_all("a", href=True): + if not a_tag.find("div", class_="chuniCommonBox-inner"): + continue + news_dict = {} + news_url = a_tag.get("href") + news_dict["url"] = news_url - date_container = a_tag.find("div", class_="chuniCommonBox-inner-title") - date_str = None - if date_container: - title_span = date_container.find("span", class_="title") - if title_span: - text = title_span.get_text(strip=True) - date_match = re.search(r"(\d{4}\.\d{2}\.\d{2})", text) - if date_match: - date_str = date_match.group(1) - news_dict["date"] = date_str - news_dict["type"] = None - timestamp = None - if date_str: - try: - dt = datetime.strptime(date_str, "%Y.%m.%d") - dt = dt.replace(tzinfo=timezone(timedelta(hours=9))) - timestamp = int(dt.timestamp()) - except Exception: - timestamp = None - news_dict["timestamp"] = 
timestamp + date_container = a_tag.find("div", class_="chuniCommonBox-inner-title") + date_str = None + if date_container: + title_span = date_container.find("span", class_="title") + if title_span: + text = title_span.get_text(strip=True) + date_match = re.search(r"(\d{4}\.\d{2}\.\d{2})", text) + if date_match: + date_str = date_match.group(1) + news_dict["date"] = date_str + news_dict["type"] = None + timestamp = None + if date_str: + try: + dt = datetime.strptime(date_str, "%Y.%m.%d") + dt = dt.replace(tzinfo=timezone(timedelta(hours=9))) + timestamp = int(dt.timestamp()) + except Exception: + timestamp = None + news_dict["timestamp"] = timestamp - main_content = a_tag.find("div", class_="chuniCommonBox-inner-main") - content_text = "" - if main_content: - content_text = main_content.get_text(separator=" ", strip=True) - news_dict["content"] = content_text + main_content = a_tag.find("div", class_="chuniCommonBox-inner-main") + content_text = "" + if main_content: + content_text = main_content.get_text(separator=" ", strip=True) + news_dict["content"] = content_text - images = {"image": None, "link": None} - if main_content: - img_tag = main_content.find("img") - if img_tag: - images["image"] = img_tag.get("src") - images["link"] = news_url - news_dict["images"] = [images] - news_dict["identifier"] = identifier - news_dict["is_ai_summary"] = False + images = {"image": None, "link": None} + if main_content: + img_tag = main_content.find("img") + if img_tag: + images["image"] = img_tag.get("src") + images["link"] = news_url + news_dict["images"] = [images] + news_dict["identifier"] = identifier + news_dict["is_ai_summary"] = False - news_entries.append(news_dict) + news_entries.append(news_dict) - return news_entries + return news_entries - if parser == ParserVersion.ALPHA: - return alpha_parser - -def make_image_extractor(version: ParserVersion): +def parse_chuni_jp_post_images(html: str): """ - Gets all the images from a full post page as CHUNITHM intl has more 
relevant images - hidden in the actual posts + Gets all the images from a full post page as CHUNITHM JP has more relevant images + hidden in the actual posts. """ + base_url = "https://info-chunithm.sega.jp/" + soup = BeautifulSoup(html, "html.parser") + images = [] - def image_extractor_alpha(html: str): - base_url = "https://info-chunithm.sega.jp/" - soup = BeautifulSoup(html, "html.parser") - images = [] - - container = soup.select_one(".chuniCommonBox-inner-main") - if not container: - return images - for img in container.find_all("img"): - if img.find_parent("p") and "©" in img.find_parent("p").text: - continue - - src = img.get("src") or img.get("data-src") - if not src: - continue - full_url = urljoin(base_url, src) - parent = img.find_parent("a") - link = parent.get("href") if parent and parent.name == "a" else None - images.append( - {"image": full_url, "link": urljoin(base_url, link) if link else None} - ) + container = soup.select_one(".chuniCommonBox-inner-main") + if not container: return images + for img in container.find_all("img"): + if img.find_parent("p") and "©" in img.find_parent("p").text: + continue - if version == ParserVersion.ALPHA: - return image_extractor_alpha - else: - raise ValueError("Unknown Parser Version") - - -parse_chuni_jp_news_site = make_chuni_jp_parser( - "CHUNITHM_JP", ParserVersion.ALPHA -) -parse_chuni_jp_post_images = make_image_extractor(ParserVersion.ALPHA) + src = img.get("src") or img.get("data-src") + if not src: + continue + full_url = urljoin(base_url, src) + parent = img.find_parent("a") + link = parent.get("href") if parent and parent.name == "a" else None + images.append( + {"image": full_url, "link": urljoin(base_url, link) if link else None} + ) + return images diff --git a/sega/maimaidx_intl.py b/sega/maimaidx_intl.py index 3e26a37..8182117 100644 --- a/sega/maimaidx_intl.py +++ b/sega/maimaidx_intl.py @@ -1,53 +1,7 @@ from bs4 import BeautifulSoup from datetime import datetime, timezone, timedelta -from enum 
import Enum import json -class ParserVersion(Enum): - ALPHA=1 - -def make_maimaidx_intl_parser(identifier: str, parser: ParserVersion): - """ - Parses the download page of maimai dx intl site. API route method below is preferred as information is the same - """ - def alpha_parser(html: str): - """ - Confirmed on: - PRISM - """ - soup = BeautifulSoup(html, "html.parser") - items = soup.select(".dl--pop__item") - - entries = [] - for item in items: - date_text = item.select_one(".dl--pop__head").text.strip().replace(" UP", "") - dt = datetime.strptime(date_text, "%Y.%m.%d").replace(tzinfo=timezone(timedelta(hours=9))) - timestamp = int(dt.timestamp()) - - img_tag = item.select_one("a.dl--pop__thumb img") - image_url = img_tag["srcset"] if img_tag else None - full_image_url = image_url.replace("../", "https://maimai.sega.com/") if image_url else None - - entry = { - "date": date_text, - "identifier": identifier, - "type": None, - "timestamp": timestamp, - "headline": None, - "content": f"New maimai DX International News / maimai DX International の新しいお知らせ\n\n{full_image_url}", - "url": None, - "images": [ - { - "image": full_image_url, - "link": None - } - ], - 'is_ai_summary': False - } - entries.append(entry) - return entries - if parser == ParserVersion.ALPHA: - return alpha_parser def parse_maimaidx_intl_api_route(raw_api_data: str, identifier: str, limit: int): route_data = json.loads(raw_api_data) @@ -84,6 +38,3 @@ def parse_maimaidx_intl_api_route(raw_api_data: str, identifier: str, limit: int } entries.append(entry) return entries - - -parse_maimaidx_intl_news_site = make_maimaidx_intl_parser("MAIMAIDX_INTL", ParserVersion.ALPHA) diff --git a/sega/maimaidx_jp.py b/sega/maimaidx_jp.py index 1314325..2b61c9a 100644 --- a/sega/maimaidx_jp.py +++ b/sega/maimaidx_jp.py @@ -1,60 +1,53 @@ from bs4 import BeautifulSoup from datetime import datetime, timezone, timedelta from urllib.parse import urljoin -from enum import Enum -class ParserVersion(Enum): - ALPHA=1 -def 
make_maimaidx_jpn_parser(identifier: str, parser: ParserVersion): - def alpha_parser(html: str): - """ - Confirmed on: - PRISM PLUS - """ - soup = BeautifulSoup(html, "html.parser") - base_url = "https://info-maimai.sega.jp/" - news_items = [] +def parse_maimaidx_jp_news_site(html: str): + """ + Confirmed on: + PRISM PLUS + """ + identifier = "MAIMAIDX_JP" + soup = BeautifulSoup(html, "html.parser") + base_url = "https://info-maimai.sega.jp/" + news_items = [] - news_boxes = soup.select(".maiPager-content .newsBox") - for box in news_boxes: - a_tag = box.select_one("a") - url = urljoin(base_url, a_tag["href"]) if a_tag and a_tag.get("href") else None + news_boxes = soup.select(".maiPager-content .newsBox") + for box in news_boxes: + a_tag = box.select_one("a") + url = urljoin(base_url, a_tag["href"]) if a_tag and a_tag.get("href") else None - img_tag = box.select_one("img") - image_url = urljoin(base_url, img_tag["src"]) if img_tag else None + img_tag = box.select_one("img") + image_url = urljoin(base_url, img_tag["src"]) if img_tag else None - date_tag = box.select_one(".newsDate") - raw_date = date_tag.get_text(strip=True) if date_tag else None + date_tag = box.select_one(".newsDate") + raw_date = date_tag.get_text(strip=True) if date_tag else None - jst = timezone(timedelta(hours=9)) - try: - dt = datetime.strptime(raw_date.split(" ")[0], "%Y.%m.%d").replace(tzinfo=jst) - timestamp = int(dt.timestamp()) - except: - dt = None - timestamp = 0 + jst = timezone(timedelta(hours=9)) + try: + dt = datetime.strptime(raw_date.split(" ")[0], "%Y.%m.%d").replace(tzinfo=jst) + timestamp = int(dt.timestamp()) + except Exception: + dt = None + timestamp = 0 - content_tag = box.select_one(".newsLink") - content = content_tag.get_text(strip=True) if content_tag else None + content_tag = box.select_one(".newsLink") + content = content_tag.get_text(strip=True) if content_tag else None - news_items.append({ - "date": raw_date, - "identifier": identifier, - "type": None, - 
"timestamp": timestamp, - "headline": None, - "content": content, - "url": url, - 'is_ai_summary': False, - "images": [{ - "image": image_url, - "link": url - }] if image_url else [] - }) + news_items.append({ + "date": raw_date, + "identifier": identifier, + "type": None, + "timestamp": timestamp, + "headline": None, + "content": content, + "url": url, + "is_ai_summary": False, + "images": [{ + "image": image_url, + "link": url + }] if image_url else [] + }) - return news_items - if parser == ParserVersion.ALPHA: - return alpha_parser - -parse_maimaidx_jp_news_site = make_maimaidx_jpn_parser("MAIMAIDX_JP", ParserVersion.ALPHA) + return news_items
\ No newline at end of file diff --git a/sega/ongeki_jp.py b/sega/ongeki_jp.py index f9c2dc4..c173189 100644 --- a/sega/ongeki_jp.py +++ b/sega/ongeki_jp.py @@ -1,68 +1,58 @@ -import time -from datetime import datetime -from enum import Enum +from datetime import datetime, timezone, timedelta from bs4 import BeautifulSoup +JST = timezone(timedelta(hours=9)) -class ParserVersion(Enum): - ALPHA = 1 +def parse_ongeki_news_site(html: str): + identifier = "ONGEKI_JPN" + soup = BeautifulSoup(html, "html.parser") + items = [] -def make_ongeki_parser(identifier: str, parser: ParserVersion): - def alpha_parser(html: str): - soup = BeautifulSoup(html, "html.parser") - items = [] + for li in soup.select("li.p-news__listChild"): + a_tag = li.select_one("a.p-news__listLink") + url = a_tag["href"] if a_tag else None - for li in soup.select("li.p-news__listChild"): - a_tag = li.select_one("a.p-news__listLink") - url = a_tag["href"] if a_tag else None + img_tag = li.select_one(".p-news__listThumb img") + image_url = img_tag["src"] if img_tag else None + image_alt = img_tag["alt"] if img_tag else "" + image_link = url if image_url else None - img_tag = li.select_one(".p-news__listThumb img") - image_url = img_tag["src"] if img_tag else None - image_alt = img_tag["alt"] if img_tag else "" - image_link = url if image_url else None + date_type_text = li.select_one(".p-news__listTextUpper") + date_text = ( + date_type_text.text.strip().split("/")[0].strip() + if date_type_text + else None + ) + type_text = ( + date_type_text.text.strip().split("/")[-1].strip() + if date_type_text and "/" in date_type_text.text + else None + ) - date_type_text = li.select_one(".p-news__listTextUpper") - date_text = ( - date_type_text.text.strip().split("/")[0].strip() - if date_type_text - else None - ) - type_text = ( - date_type_text.text.strip().split("/")[-1].strip() - if "/" in date_type_text.text - else None - ) + timestamp = None + if date_text: + try: + dt = datetime.strptime(date_text, 
"%Y.%m.%d %a").replace(tzinfo=JST) + timestamp = int(dt.timestamp()) + except Exception: + timestamp = None - timestamp = None - if date_text: - try: - dt = datetime.strptime(date_text, "%Y.%m.%d %a") - timestamp = int(time.mktime(dt.timetuple())) - except: - timestamp = None + entry = { + "date": date_text, + "identifier": identifier, + "type": type_text if type_text not in ["GAME", "CARDMAKER"] else None, + "timestamp": timestamp, + "headline": None, + "content": image_alt, + "url": url, + "is_ai_summary": False, + "images": [{"image": image_url, "link": image_link}] + if image_url + else [], + } - entry = { - "date": date_text, - "identifier": identifier, - "type": type_text if type_text not in ["GAME", "CARDMAKER"] else None, - "timestamp": timestamp, - "headline": None, - "content": image_alt, - "url": url, - "is_ai_summary": False, - "images": [{"image": image_url, "link": image_link}] - if image_url - else [], - } + items.append(entry) - items.append(entry) - - return items - - if parser == ParserVersion.ALPHA: - return alpha_parser - - -parse_ongeki_news_site = make_ongeki_parser("ONGEKI_JPN", ParserVersion.ALPHA) + return items
\ No newline at end of file |
