aboutsummaryrefslogtreecommitdiffstats
path: root/sega/chuni_intl.py
diff options
context:
space:
mode:
authorPinapelz <yukais@pinapelz.com>2026-03-12 13:56:30 -0700
committerPinapelz <yukais@pinapelz.com>2026-03-12 13:56:50 -0700
commitcaa3cf245186ab0f6fb33e63a7dd838d834da12e (patch)
treebc5742a134ecabf0b9d35cc12b1d6f67defd5da7 /sega/chuni_intl.py
parent5658441ab9b703c95a48e654d41e45cc3a55ffd3 (diff)
refactor: move to common NewsSource interface
cleanup imports by defining initaliazers modules and decorator remove legacy scrapers remove single factory for sega games (sites don't change that much)
Diffstat (limited to 'sega/chuni_intl.py')
-rw-r--r--sega/chuni_intl.py166
1 files changed, 73 insertions, 93 deletions
diff --git a/sega/chuni_intl.py b/sega/chuni_intl.py
index 64d279c..816b857 100644
--- a/sega/chuni_intl.py
+++ b/sega/chuni_intl.py
@@ -1,100 +1,11 @@
import re
from datetime import datetime, timedelta, timezone
-from enum import Enum
import json
from urllib.parse import urljoin
from bs4 import BeautifulSoup
-class ParserVersion(Enum):
- ALPHA = 1
-
-
-def make_chuni_intl_parser(identifier: str, parser: ParserVersion):
- def alpha_parser(html: str):
- """
- Confirmed on:
- LUMINOUS PLUS
- """
- soup = BeautifulSoup(html, "html.parser")
- base_url = "https://info-chunithm.sega.com/"
- items = soup.select("li.news--list__item")
- results = []
-
- for item in items:
- a_tag = item.select_one("a.news--list__post")
- if not a_tag:
- continue
-
- url = urljoin(base_url, a_tag["href"])
- date_text = item.select_one("div.news--date").text.strip()
- headline = item.select_one("p.news--title").text.strip()
- img_tag = item.select_one("div.news--thumbnail img")
- image_url = urljoin(base_url, img_tag["src"]) if img_tag else None
-
- date_match = re.match(r"(\d{4})\.(\d{1,2})\.(\d{1,2})", date_text)
- if not date_match:
- continue
- year, month, day = map(int, date_match.groups())
- jst = timezone(timedelta(hours=9))
- dt = datetime(year, month, day, tzinfo=jst)
- timestamp = int(dt.timestamp())
-
- results.append(
- {
- "date": dt.strftime("%Y-%m-%d"),
- "identifier": identifier,
- "type": None,
- "timestamp": timestamp,
- "headline": None,
- "content": headline,
- "url": url,
- "images": [{"image": image_url, "link": url}] if image_url else [],
- 'is_ai_summary': False
- }
- )
-
- return results
-
- if parser == ParserVersion.ALPHA:
- return alpha_parser
-
-
-def make_image_extractor(version: ParserVersion):
- """
- Gets all the images from a full post page as CHUNITHM intl has more relevant images
- hidden in the actual posts
- """
-
- def image_extractor_alpha(html: str):
- base_url = "https://info-chunithm.sega.com/"
- soup = BeautifulSoup(html, "html.parser")
- images = []
- news_post = soup.select_one(".news--post")
- if not news_post:
- return images
-
- for img in news_post.find_all("img"):
- src = img.get("src") or img.get("data-src")
- if not src:
- continue
-
- full_url = urljoin(base_url, src)
- parent = img.find_parent("a")
- link = parent.get("href") if parent and parent.name == "a" else None
-
- images.append(
- {"image": full_url, "link": urljoin(base_url, link) if link else None}
- )
-
- return images
-
- if version == ParserVersion.ALPHA:
- return image_extractor_alpha
- else:
- raise ValueError("Unknown Parser Version")
-
def parse_chuni_intl_api_route(raw_api_data: str, identifier: str, limit: int):
route_data = json.loads(raw_api_data)
route_data = route_data[:limit]
@@ -126,7 +37,76 @@ def parse_chuni_intl_api_route(raw_api_data: str, identifier: str, limit: int):
return entries
-parse_chuni_intl_news_site = make_chuni_intl_parser(
- "CHUNITHM_INTL", ParserVersion.ALPHA
-)
-parse_chuni_intl_post_images = make_image_extractor(ParserVersion.ALPHA)
+def parse_chuni_intl_post_images(html: str):
+ """
+ Gets all the images from a full post page as CHUNITHM intl has more relevant images
+ hidden in the actual posts.
+ """
+ base_url = "https://info-chunithm.sega.com/"
+ soup = BeautifulSoup(html, "html.parser")
+ images = []
+ news_post = soup.select_one(".news--post")
+ if not news_post:
+ return images
+
+ for img in news_post.find_all("img"):
+ src = img.get("src") or img.get("data-src")
+ if not src:
+ continue
+
+ full_url = urljoin(base_url, src)
+ parent = img.find_parent("a")
+ link = parent.get("href") if parent and parent.name == "a" else None
+
+ images.append(
+ {"image": full_url, "link": urljoin(base_url, link) if link else None}
+ )
+
+ return images
+
+
+def parse_chuni_intl_news_site(html: str):
+ """
+ Confirmed on:
+ LUMINOUS PLUS
+ """
+ identifier = "CHUNITHM_INTL"
+ soup = BeautifulSoup(html, "html.parser")
+ base_url = "https://info-chunithm.sega.com/"
+ items = soup.select("li.news--list__item")
+ results = []
+
+ for item in items:
+ a_tag = item.select_one("a.news--list__post")
+ if not a_tag:
+ continue
+
+ url = urljoin(base_url, a_tag["href"])
+ date_text = item.select_one("div.news--date").text.strip()
+ headline = item.select_one("p.news--title").text.strip()
+ img_tag = item.select_one("div.news--thumbnail img")
+ image_url = urljoin(base_url, img_tag["src"]) if img_tag else None
+
+ date_match = re.match(r"(\d{4})\.(\d{1,2})\.(\d{1,2})", date_text)
+ if not date_match:
+ continue
+ year, month, day = map(int, date_match.groups())
+ jst = timezone(timedelta(hours=9))
+ dt = datetime(year, month, day, tzinfo=jst)
+ timestamp = int(dt.timestamp())
+
+ results.append(
+ {
+ "date": dt.strftime("%Y-%m-%d"),
+ "identifier": identifier,
+ "type": None,
+ "timestamp": timestamp,
+ "headline": None,
+ "content": headline,
+ "url": url,
+ "images": [{"image": image_url, "link": url}] if image_url else [],
+ "is_ai_summary": False,
+ }
+ )
+
+ return results \ No newline at end of file
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage