refactor: move to common NewsSource interface

Clean up imports by defining initializer modules and a decorator; remove legacy scrapers; remove the single-version parser factories for SEGA games (the sites don't change that much).
author: Pinapelz <yukais@pinapelz.com> 2026-03-12 13:56:30 -0700
committer: Pinapelz <yukais@pinapelz.com> 2026-03-12 13:56:50 -0700
commit: caa3cf245186ab0f6fb33e63a7dd838d834da12e (patch)
tree: bc5742a134ecabf0b9d35cc12b1d6f67defd5da7 /sega
parent: 5658441ab9b703c95a48e654d41e45cc3a55ffd3 (diff)
6 files changed, 257 insertions, 341 deletions
diff --git a/sega/__init__.py b/sega/__init__.py
new file mode 100644
index 0000000..242ab52
--- /dev/null
+++ b/sega/__init__.py
@@ -0,0 +1,23 @@
+from sega.chuni_jp import parse_chuni_jp_news_site, parse_chuni_jp_post_images
+from sega.chuni_intl import (
+    parse_chuni_intl_api_route,
+    parse_chuni_intl_news_site,
+    parse_chuni_intl_post_images,
+)
+from sega.maimaidx_jp import parse_maimaidx_jp_news_site
+from sega.maimaidx_intl import parse_maimaidx_intl_api_route
+from sega.ongeki_jp import parse_ongeki_news_site
+from sega.idac import parse_idac_news_site, get_promo_image
+
+__all__ = [
+    "parse_chuni_jp_news_site",
+    "parse_chuni_jp_post_images",
+    "parse_chuni_intl_api_route",
+    "parse_chuni_intl_news_site",
+    "parse_chuni_intl_post_images",
+    "parse_maimaidx_jp_news_site",
+    "parse_maimaidx_intl_api_route",
+    "parse_ongeki_news_site",
+    "parse_idac_news_site",
+    "get_promo_image",
+]
+\ No newline at end of file
diff --git a/sega/chuni_intl.py b/sega/chuni_intl.py
index 64d279c..816b857 100644
--- a/sega/chuni_intl.py
+++ b/sega/chuni_intl.py
@@ -1,100 +1,11 @@
 import re
 from datetime import datetime, timedelta, timezone
-from enum import Enum
 import json
 from urllib.parse import urljoin
 
 from bs4 import BeautifulSoup
 
 
-class ParserVersion(Enum):
-    ALPHA = 1
-
-
-def make_chuni_intl_parser(identifier: str, parser: ParserVersion):
-    def alpha_parser(html: str):
-        """
-        Confirmed on:
-        LUMINOUS PLUS
-        """
-        soup = BeautifulSoup(html, "html.parser")
-        base_url = "https://info-chunithm.sega.com/"
-        items = soup.select("li.news--list__item")
-        results = []
-
-        for item in items:
-            a_tag = item.select_one("a.news--list__post")
-            if not a_tag:
-                continue
-
-            url = urljoin(base_url, a_tag["href"])
-            date_text = item.select_one("div.news--date").text.strip()
-            headline = item.select_one("p.news--title").text.strip()
-            img_tag = item.select_one("div.news--thumbnail img")
-            image_url = urljoin(base_url, img_tag["src"]) if img_tag else None
-
-            date_match = re.match(r"(\d{4})\.(\d{1,2})\.(\d{1,2})", date_text)
-            if not date_match:
-                continue
-            year, month, day = map(int, date_match.groups())
-            jst = timezone(timedelta(hours=9))
-            dt = datetime(year, month, day, tzinfo=jst)
-            timestamp = int(dt.timestamp())
-
-            results.append(
-                {
-                    "date": dt.strftime("%Y-%m-%d"),
-                    "identifier": identifier,
-                    "type": None,
-                    "timestamp": timestamp,
-                    "headline": None,
-                    "content": headline,
-                    "url": url,
-                    "images": [{"image": image_url, "link": url}] if image_url else [],
-                    'is_ai_summary': False
-                }
-            )
-
-        return results
-
-    if parser == ParserVersion.ALPHA:
-        return alpha_parser
-
-
-def make_image_extractor(version: ParserVersion):
-    """
-    Gets all the images from a full post page as CHUNITHM intl has more relevant images
-    hidden in the actual posts
-    """
-
-    def image_extractor_alpha(html: str):
-        base_url = "https://info-chunithm.sega.com/"
-        soup = BeautifulSoup(html, "html.parser")
-        images = []
-        news_post = soup.select_one(".news--post")
-        if not news_post:
-            return images
-
-        for img in news_post.find_all("img"):
-            src = img.get("src") or img.get("data-src")
-            if not src:
-                continue
-
-            full_url = urljoin(base_url, src)
-            parent = img.find_parent("a")
-            link = parent.get("href") if parent and parent.name == "a" else None
-
-            images.append(
-                {"image": full_url, "link": urljoin(base_url, link) if link else None}
-            )
-
-        return images
-
-    if version == ParserVersion.ALPHA:
-        return image_extractor_alpha
-    else:
-        raise ValueError("Unknown Parser Version")
-
 def parse_chuni_intl_api_route(raw_api_data: str, identifier: str, limit: int):
     route_data = json.loads(raw_api_data)
     route_data = route_data[:limit]
@@ -126,7 +37,76 @@ def parse_chuni_intl_api_route(raw_api_data: str, identifier: str, limit: int):
     return entries
 
 
-parse_chuni_intl_news_site = make_chuni_intl_parser(
-    "CHUNITHM_INTL", ParserVersion.ALPHA
-)
-parse_chuni_intl_post_images = make_image_extractor(ParserVersion.ALPHA)
+def parse_chuni_intl_post_images(html: str):
+    """
+    Gets all the images from a full post page as CHUNITHM intl has more relevant images
+    hidden in the actual posts.
+    """
+    base_url = "https://info-chunithm.sega.com/"
+    soup = BeautifulSoup(html, "html.parser")
+    images = []
+    news_post = soup.select_one(".news--post")
+    if not news_post:
+        return images
+
+    for img in news_post.find_all("img"):
+        src = img.get("src") or img.get("data-src")
+        if not src:
+            continue
+
+        full_url = urljoin(base_url, src)
+        parent = img.find_parent("a")
+        link = parent.get("href") if parent and parent.name == "a" else None
+
+        images.append(
+            {"image": full_url, "link": urljoin(base_url, link) if link else None}
+        )
+
+    return images
+
+
+def parse_chuni_intl_news_site(html: str):
+    """
+    Confirmed on:
+    LUMINOUS PLUS
+    """
+    identifier = "CHUNITHM_INTL"
+    soup = BeautifulSoup(html, "html.parser")
+    base_url = "https://info-chunithm.sega.com/"
+    items = soup.select("li.news--list__item")
+    results = []
+
+    for item in items:
+        a_tag = item.select_one("a.news--list__post")
+        if not a_tag:
+            continue
+
+        url = urljoin(base_url, a_tag["href"])
+        date_text = item.select_one("div.news--date").text.strip()
+        headline = item.select_one("p.news--title").text.strip()
+        img_tag = item.select_one("div.news--thumbnail img")
+        image_url = urljoin(base_url, img_tag["src"]) if img_tag else None
+
+        date_match = re.match(r"(\d{4})\.(\d{1,2})\.(\d{1,2})", date_text)
+        if not date_match:
+            continue
+        year, month, day = map(int, date_match.groups())
+        jst = timezone(timedelta(hours=9))
+        dt = datetime(year, month, day, tzinfo=jst)
+        timestamp = int(dt.timestamp())
+
+        results.append(
+            {
+                "date": dt.strftime("%Y-%m-%d"),
+                "identifier": identifier,
+                "type": None,
+                "timestamp": timestamp,
+                "headline": None,
+                "content": headline,
+                "url": url,
+                "images": [{"image": image_url, "link": url}] if image_url else [],
+                "is_ai_summary": False,
+            }
+        )
+
+    return results
+\ No newline at end of file
diff --git a/sega/chuni_jp.py b/sega/chuni_jp.py
index 452e153..a914270 100644
--- a/sega/chuni_jp.py
+++ b/sega/chuni_jp.py
@@ -1,114 +1,93 @@
 import re
 from datetime import datetime, timedelta, timezone
-from enum import Enum
 from urllib.parse import urljoin
 
 from bs4 import BeautifulSoup
 
 
-class ParserVersion(Enum):
-    ALPHA = 1
-
-
-def make_chuni_jp_parser(identifier: str, parser: ParserVersion):
-    def alpha_parser(html: str):
-        """
-        Confirmed on:
-        VERSE
-        """
-        soup = BeautifulSoup(html, "html.parser")
-        news_entries = []
-        news_wrapper = soup.find("div", class_="newsMainWrapper-left")
-        if not news_wrapper:
-            return news_entries
-        for a_tag in news_wrapper.find_all("a", href=True):
-            if not a_tag.find("div", class_="chuniCommonBox-inner"):
-                continue
-            news_dict = {}
-            news_url = a_tag.get("href")
-            news_dict["url"] = news_url
+def parse_chuni_jp_news_site(html: str):
+    """
+    Confirmed on:
+    VERSE
+    """
+    identifier = "CHUNITHM_JP"
+    soup = BeautifulSoup(html, "html.parser")
+    news_entries = []
+    news_wrapper = soup.find("div", class_="newsMainWrapper-left")
+    if not news_wrapper:
+        return news_entries
+    for a_tag in news_wrapper.find_all("a", href=True):
+        if not a_tag.find("div", class_="chuniCommonBox-inner"):
+            continue
+        news_dict = {}
+        news_url = a_tag.get("href")
+        news_dict["url"] = news_url
 
-            date_container = a_tag.find("div", class_="chuniCommonBox-inner-title")
-            date_str = None
-            if date_container:
-                title_span = date_container.find("span", class_="title")
-                if title_span:
-                    text = title_span.get_text(strip=True)
-                    date_match = re.search(r"(\d{4}\.\d{2}\.\d{2})", text)
-                    if date_match:
-                        date_str = date_match.group(1)
-            news_dict["date"] = date_str
-            news_dict["type"] = None
-            timestamp = None
-            if date_str:
-                try:
-                    dt = datetime.strptime(date_str, "%Y.%m.%d")
-                    dt = dt.replace(tzinfo=timezone(timedelta(hours=9)))
-                    timestamp = int(dt.timestamp())
-                except Exception:
-                    timestamp = None
-            news_dict["timestamp"] = timestamp
+        date_container = a_tag.find("div", class_="chuniCommonBox-inner-title")
+        date_str = None
+        if date_container:
+            title_span = date_container.find("span", class_="title")
+            if title_span:
+                text = title_span.get_text(strip=True)
+                date_match = re.search(r"(\d{4}\.\d{2}\.\d{2})", text)
+                if date_match:
+                    date_str = date_match.group(1)
+        news_dict["date"] = date_str
+        news_dict["type"] = None
+        timestamp = None
+        if date_str:
+            try:
+                dt = datetime.strptime(date_str, "%Y.%m.%d")
+                dt = dt.replace(tzinfo=timezone(timedelta(hours=9)))
+                timestamp = int(dt.timestamp())
+            except Exception:
+                timestamp = None
+        news_dict["timestamp"] = timestamp
 
-            main_content = a_tag.find("div", class_="chuniCommonBox-inner-main")
-            content_text = ""
-            if main_content:
-                content_text = main_content.get_text(separator=" ", strip=True)
-            news_dict["content"] = content_text
+        main_content = a_tag.find("div", class_="chuniCommonBox-inner-main")
+        content_text = ""
+        if main_content:
+            content_text = main_content.get_text(separator=" ", strip=True)
+        news_dict["content"] = content_text
 
-            images = {"image": None, "link": None}
-            if main_content:
-                img_tag = main_content.find("img")
-                if img_tag:
-                    images["image"] = img_tag.get("src")
-                    images["link"] = news_url
-            news_dict["images"] = [images]
-            news_dict["identifier"] = identifier
-            news_dict["is_ai_summary"] = False
+        images = {"image": None, "link": None}
+        if main_content:
+            img_tag = main_content.find("img")
+            if img_tag:
+                images["image"] = img_tag.get("src")
+                images["link"] = news_url
+        news_dict["images"] = [images]
+        news_dict["identifier"] = identifier
+        news_dict["is_ai_summary"] = False
 
-            news_entries.append(news_dict)
+        news_entries.append(news_dict)
 
-        return news_entries
+    return news_entries
 
-    if parser == ParserVersion.ALPHA:
-        return alpha_parser
 
-
-def make_image_extractor(version: ParserVersion):
+def parse_chuni_jp_post_images(html: str):
     """
-    Gets all the images from a full post page as CHUNITHM intl has more relevant images
-    hidden in the actual posts
+    Gets all the images from a full post page as CHUNITHM JP has more relevant images
+    hidden in the actual posts.
     """
+    base_url = "https://info-chunithm.sega.jp/"
+    soup = BeautifulSoup(html, "html.parser")
+    images = []
 
-    def image_extractor_alpha(html: str):
-        base_url = "https://info-chunithm.sega.jp/"
-        soup = BeautifulSoup(html, "html.parser")
-        images = []
-
-        container = soup.select_one(".chuniCommonBox-inner-main")
-        if not container:
-            return images
-        for img in container.find_all("img"):
-            if img.find_parent("p") and "©" in img.find_parent("p").text:
-                continue
-
-            src = img.get("src") or img.get("data-src")
-            if not src:
-                continue
-            full_url = urljoin(base_url, src)
-            parent = img.find_parent("a")
-            link = parent.get("href") if parent and parent.name == "a" else None
-            images.append(
-                {"image": full_url, "link": urljoin(base_url, link) if link else None}
-            )
+    container = soup.select_one(".chuniCommonBox-inner-main")
+    if not container:
         return images
+    for img in container.find_all("img"):
+        if img.find_parent("p") and "©" in img.find_parent("p").text:
+            continue
 
-    if version == ParserVersion.ALPHA:
-        return image_extractor_alpha
-    else:
-        raise ValueError("Unknown Parser Version")
-
-
-parse_chuni_jp_news_site = make_chuni_jp_parser(
-    "CHUNITHM_JP", ParserVersion.ALPHA
-)
-parse_chuni_jp_post_images = make_image_extractor(ParserVersion.ALPHA)
+        src = img.get("src") or img.get("data-src")
+        if not src:
+            continue
+        full_url = urljoin(base_url, src)
+        parent = img.find_parent("a")
+        link = parent.get("href") if parent and parent.name == "a" else None
+        images.append(
+            {"image": full_url, "link": urljoin(base_url, link) if link else None}
+        )
+    return images
diff --git a/sega/maimaidx_intl.py b/sega/maimaidx_intl.py
index 3e26a37..8182117 100644
--- a/sega/maimaidx_intl.py
+++ b/sega/maimaidx_intl.py
@@ -1,53 +1,7 @@
 from bs4 import BeautifulSoup
 from datetime import datetime, timezone, timedelta
-from enum import Enum
 import json
 
-class ParserVersion(Enum):
-    ALPHA=1
-
-def make_maimaidx_intl_parser(identifier: str, parser: ParserVersion):
-    """
-    Parses the download page of maimai dx intl site. API route method below is preferred as information is the same
-    """
-    def alpha_parser(html: str):
-        """
-        Confirmed on:
-        PRISM
-        """
-        soup = BeautifulSoup(html, "html.parser")
-        items = soup.select(".dl--pop__item")
-
-        entries = []
-        for item in items:
-            date_text = item.select_one(".dl--pop__head").text.strip().replace(" UP", "")
-            dt = datetime.strptime(date_text, "%Y.%m.%d").replace(tzinfo=timezone(timedelta(hours=9)))
-            timestamp = int(dt.timestamp())
-
-            img_tag = item.select_one("a.dl--pop__thumb img")
-            image_url = img_tag["srcset"] if img_tag else None
-            full_image_url = image_url.replace("../", "https://maimai.sega.com/") if image_url else None
-
-            entry = {
-                "date": date_text,
-                "identifier": identifier,
-                "type": None,
-                "timestamp": timestamp,
-                "headline": None,
-                "content": f"New maimai DX International News / maimai DX International の新しいお知らせ\n\n{full_image_url}",
-                "url": None,
-                "images": [
-                    {
-                        "image": full_image_url,
-                        "link": None
-                    }
-                ],
-                'is_ai_summary': False
-            }
-            entries.append(entry)
-        return entries
-    if parser == ParserVersion.ALPHA:
-        return alpha_parser
 
 def parse_maimaidx_intl_api_route(raw_api_data: str, identifier: str, limit: int):
     route_data = json.loads(raw_api_data)
@@ -84,6 +38,3 @@ def parse_maimaidx_intl_api_route(raw_api_data: str, identifier: str, limit: int
         }
         entries.append(entry)
     return entries
-
-
-parse_maimaidx_intl_news_site = make_maimaidx_intl_parser("MAIMAIDX_INTL", ParserVersion.ALPHA)
diff --git a/sega/maimaidx_jp.py b/sega/maimaidx_jp.py
index 1314325..2b61c9a 100644
--- a/sega/maimaidx_jp.py
+++ b/sega/maimaidx_jp.py
@@ -1,60 +1,53 @@
 from bs4 import BeautifulSoup
 from datetime import datetime, timezone, timedelta
 from urllib.parse import urljoin
-from enum import Enum
 
-class ParserVersion(Enum):
-    ALPHA=1
 
-def make_maimaidx_jpn_parser(identifier: str, parser: ParserVersion):
-    def alpha_parser(html: str):
-        """
-        Confirmed on:
-        PRISM PLUS
-        """
-        soup = BeautifulSoup(html, "html.parser")
-        base_url = "https://info-maimai.sega.jp/"
-        news_items = []
+def parse_maimaidx_jp_news_site(html: str):
+    """
+    Confirmed on:
+    PRISM PLUS
+    """
+    identifier = "MAIMAIDX_JP"
+    soup = BeautifulSoup(html, "html.parser")
+    base_url = "https://info-maimai.sega.jp/"
+    news_items = []
 
-        news_boxes = soup.select(".maiPager-content .newsBox")
-        for box in news_boxes:
-            a_tag = box.select_one("a")
-            url = urljoin(base_url, a_tag["href"]) if a_tag and a_tag.get("href") else None
+    news_boxes = soup.select(".maiPager-content .newsBox")
+    for box in news_boxes:
+        a_tag = box.select_one("a")
+        url = urljoin(base_url, a_tag["href"]) if a_tag and a_tag.get("href") else None
 
-            img_tag = box.select_one("img")
-            image_url = urljoin(base_url, img_tag["src"]) if img_tag else None
+        img_tag = box.select_one("img")
+        image_url = urljoin(base_url, img_tag["src"]) if img_tag else None
 
-            date_tag = box.select_one(".newsDate")
-            raw_date = date_tag.get_text(strip=True) if date_tag else None
+        date_tag = box.select_one(".newsDate")
+        raw_date = date_tag.get_text(strip=True) if date_tag else None
 
-            jst = timezone(timedelta(hours=9))
-            try:
-                dt = datetime.strptime(raw_date.split(" ")[0], "%Y.%m.%d").replace(tzinfo=jst)
-                timestamp = int(dt.timestamp())
-            except:
-                dt = None
-                timestamp = 0
+        jst = timezone(timedelta(hours=9))
+        try:
+            dt = datetime.strptime(raw_date.split(" ")[0], "%Y.%m.%d").replace(tzinfo=jst)
+            timestamp = int(dt.timestamp())
+        except Exception:
+            dt = None
+            timestamp = 0
 
-            content_tag = box.select_one(".newsLink")
-            content = content_tag.get_text(strip=True) if content_tag else None
+        content_tag = box.select_one(".newsLink")
+        content = content_tag.get_text(strip=True) if content_tag else None
 
-            news_items.append({
-                "date": raw_date,
-                "identifier": identifier,
-                "type": None,
-                "timestamp": timestamp,
-                "headline": None,
-                "content": content,
-                "url": url,
-                'is_ai_summary': False,
-                "images": [{
-                    "image": image_url,
-                    "link": url
-                }] if image_url else []
-            })
+        news_items.append({
+            "date": raw_date,
+            "identifier": identifier,
+            "type": None,
+            "timestamp": timestamp,
+            "headline": None,
+            "content": content,
+            "url": url,
+            "is_ai_summary": False,
+            "images": [{
+                "image": image_url,
+                "link": url
+            }] if image_url else []
+        })
 
-        return news_items
-    if parser == ParserVersion.ALPHA:
-        return alpha_parser
-
-parse_maimaidx_jp_news_site = make_maimaidx_jpn_parser("MAIMAIDX_JP", ParserVersion.ALPHA)
+    return news_items
+\ No newline at end of file
diff --git a/sega/ongeki_jp.py b/sega/ongeki_jp.py
index f9c2dc4..c173189 100644
--- a/sega/ongeki_jp.py
+++ b/sega/ongeki_jp.py
@@ -1,68 +1,58 @@
-import time
-from datetime import datetime
-from enum import Enum
+from datetime import datetime, timezone, timedelta
 
 from bs4 import BeautifulSoup
 
+JST = timezone(timedelta(hours=9))
 
-class ParserVersion(Enum):
-    ALPHA = 1
 
+def parse_ongeki_news_site(html: str):
+    identifier = "ONGEKI_JPN"
+    soup = BeautifulSoup(html, "html.parser")
+    items = []
 
-def make_ongeki_parser(identifier: str, parser: ParserVersion):
-    def alpha_parser(html: str):
-        soup = BeautifulSoup(html, "html.parser")
-        items = []
+    for li in soup.select("li.p-news__listChild"):
+        a_tag = li.select_one("a.p-news__listLink")
+        url = a_tag["href"] if a_tag else None
 
-        for li in soup.select("li.p-news__listChild"):
-            a_tag = li.select_one("a.p-news__listLink")
-            url = a_tag["href"] if a_tag else None
+        img_tag = li.select_one(".p-news__listThumb img")
+        image_url = img_tag["src"] if img_tag else None
+        image_alt = img_tag["alt"] if img_tag else ""
+        image_link = url if image_url else None
 
-            img_tag = li.select_one(".p-news__listThumb img")
-            image_url = img_tag["src"] if img_tag else None
-            image_alt = img_tag["alt"] if img_tag else ""
-            image_link = url if image_url else None
+        date_type_text = li.select_one(".p-news__listTextUpper")
+        date_text = (
+            date_type_text.text.strip().split("/")[0].strip()
+            if date_type_text
+            else None
+        )
+        type_text = (
+            date_type_text.text.strip().split("/")[-1].strip()
+            if date_type_text and "/" in date_type_text.text
+            else None
+        )
 
-            date_type_text = li.select_one(".p-news__listTextUpper")
-            date_text = (
-                date_type_text.text.strip().split("/")[0].strip()
-                if date_type_text
-                else None
-            )
-            type_text = (
-                date_type_text.text.strip().split("/")[-1].strip()
-                if "/" in date_type_text.text
-                else None
-            )
+        timestamp = None
+        if date_text:
+            try:
+                dt = datetime.strptime(date_text, "%Y.%m.%d %a").replace(tzinfo=JST)
+                timestamp = int(dt.timestamp())
+            except Exception:
+                timestamp = None
 
-            timestamp = None
-            if date_text:
-                try:
-                    dt = datetime.strptime(date_text, "%Y.%m.%d %a")
-                    timestamp = int(time.mktime(dt.timetuple()))
-                except:
-                    timestamp = None
+        entry = {
+            "date": date_text,
+            "identifier": identifier,
+            "type": type_text if type_text not in ["GAME", "CARDMAKER"] else None,
+            "timestamp": timestamp,
+            "headline": None,
+            "content": image_alt,
+            "url": url,
+            "is_ai_summary": False,
+            "images": [{"image": image_url, "link": image_link}]
+            if image_url
+            else [],
+        }
 
-            entry = {
-                "date": date_text,
-                "identifier": identifier,
-                "type": type_text if type_text not in ["GAME", "CARDMAKER"] else None,
-                "timestamp": timestamp,
-                "headline": None,
-                "content": image_alt,
-                "url": url,
-                "is_ai_summary": False,
-                "images": [{"image": image_url, "link": image_link}]
-                if image_url
-                else [],
-            }
+        items.append(entry)
 
-            items.append(entry)
-
-        return items
-
-    if parser == ParserVersion.ALPHA:
-        return alpha_parser
-
-
-parse_ongeki_news_site = make_ongeki_parser("ONGEKI_JPN", ParserVersion.ALPHA)
+    return items
+\ No newline at end of file
author	Pinapelz <yukais@pinapelz.com>	2026-03-12 13:56:30 -0700
committer	Pinapelz <yukais@pinapelz.com>	2026-03-12 13:56:50 -0700
commit	caa3cf245186ab0f6fb33e63a7dd838d834da12e (patch)
tree	bc5742a134ecabf0b9d35cc12b1d6f67defd5da7 /sega
parent	5658441ab9b703c95a48e654d41e45cc3a55ffd3 (diff)