refactor: move to common NewsSource interface

Clean up imports by defining initializer modules and a decorator; remove legacy scrapers; remove the single factory for Sega games (the sites don't change that much).
author: Pinapelz <yukais@pinapelz.com> 2026-03-12 13:56:30 -0700
committer: Pinapelz <yukais@pinapelz.com> 2026-03-12 13:56:50 -0700
commit: caa3cf245186ab0f6fb33e63a7dd838d834da12e (patch)
tree: bc5742a134ecabf0b9d35cc12b1d6f67defd5da7 /taito
parent: 5658441ab9b703c95a48e654d41e45cc3a55ffd3 (diff)
3 files changed, 74 insertions, 78 deletions
diff --git a/taito/__init__.py b/taito/__init__.py
new file mode 100644
index 0000000..bc55d25
--- /dev/null
+++ b/taito/__init__.py
@@ -0,0 +1,7 @@
+from taito.music_diver import parse_music_diver_news_json
+from taito.street_fighter import parse_sf_news_site
+
+__all__ = [
+    "parse_music_diver_news_json",
+    "parse_sf_news_site",
+]
+\ No newline at end of file
diff --git a/taito/music_diver.py b/taito/music_diver.py
index 5469ad5..efab0b0 100644
--- a/taito/music_diver.py
+++ b/taito/music_diver.py
@@ -52,6 +52,7 @@ def parse_music_diver_news_json(data_str: str):
             "headline": post["title"],
             "content": content,
             "url": None,
-            "images": images
+            "images": images,
+            "is_ai_summary": False
         })
     return news_posts
diff --git a/taito/street_fighter.py b/taito/street_fighter.py
index 987b72b..bf58090 100644
--- a/taito/street_fighter.py
+++ b/taito/street_fighter.py
@@ -3,15 +3,12 @@ from bs4 import BeautifulSoup
 import re
 from datetime import datetime
 from urllib.parse import urljoin
-from enum import Enum
 from constants import STREET_FIGHTER_NEWS_SITE
 import requests
 import base64
 
 IMAGE_LIMIT = 10 # only allow 10 images to be processed as b64 is expensive to store
 
-class ParserVersion(Enum):
-    ALPHA = 1
 
 def _convert_image_to_base64(img_url: str):
     headers = {
@@ -26,81 +23,72 @@ def _convert_image_to_base64(img_url: str):
     else:
         raise Exception(f"Failed to fetch image from URL: {img_url}, status code: {response.status_code}")
 
-def make_sf_parser(identifier: str, parser: ParserVersion):
-    def alpha_parser(html: str):
-        soup = BeautifulSoup(html, "html.parser")
-        news_entries = []
-        img_processed = 0
-        news_links = soup.find_all('a', class_='btn_latestnews')
-        for link in news_links:
-            try:
-                url = link.get('href', '')
-                if url.startswith('/'):
-                    url = urljoin(STREET_FIGHTER_NEWS_SITE, url)
-                info_p = link.find('p', class_='info_list_event')
-                if not info_p:
-                    continue
-                date_span = info_p.find('span', class_='latestnews_date')
-                if not date_span:
-                    continue
-                date_text = date_span.get_text(strip=True)
-                date_match = re.match(r'(\d{4}-\d{2}-\d{2})\s+(\d{2}:\d{2})\s*　(.+)', date_text)
-                if not date_match:
-                    continue
-                date_str = date_match.group(1)
-                time_str = date_match.group(2)
-                datetime_str = f"{date_str} {time_str}"
-                try:
-                    post_date = datetime.strptime(datetime_str, "%Y-%m-%d %H:%M")
-                    timestamp = int(post_date.timestamp())
-                except ValueError:
-                    continue
-                headline_span = info_p.find('span', class_='info_list_txt')
-                headline = headline_span.get_text(strip=True) if headline_span else ""
-                headline = re.sub(r'<br\s*/?>', ' ', headline)
-                headline = re.sub(r'\s+', ' ', headline).strip()
-                images = []
-                img_div = link.find('div', class_='image')
-                if img_div:
-                    img_tag = img_div.find('img')
-                    if img_tag:
-                        img_src = img_tag.get('src', '')
-                        if img_src.startswith('/'):
-                            img_src = urljoin('https://sf6ta.jp', img_src)
-                        if img_processed <= IMAGE_LIMIT:
-                            try:
-                                img_b64 = _convert_image_to_base64(img_src)
-                                images.append({
-                                    'image': img_b64,
-                                    'link': url
-                                })
-                            except Exception:
-                                pass # Failed likely due to 403. Just show no images in that case
-                            img_processed += 1
-                news_entry = {
-                    'date': post_date.strftime("%Y-%m-%d %H:%M"),
-                    'identifier': identifier,
-                    'type': None,
-                    'timestamp': timestamp,
-                    'headline': None,
-                    'content': headline, # content should be prio-ed over headline
-                    'url': url,
-                    'images': images,
-                    'is_ai_summary': False
-                }
-                news_entries.append(news_entry)
 
-            except Exception as e:
+def parse_sf_news_site(html: str):
+    identifier = "STREET_FIGHTER"
+    soup = BeautifulSoup(html, "html.parser")
+    news_entries = []
+    img_processed = 0
+    news_links = soup.find_all('a', class_='btn_latestnews')
+    for link in news_links:
+        try:
+            url = link.get('href', '')
+            if url.startswith('/'):
+                url = urljoin(STREET_FIGHTER_NEWS_SITE, url)
+            info_p = link.find('p', class_='info_list_event')
+            if not info_p:
                 continue
+            date_span = info_p.find('span', class_='latestnews_date')
+            if not date_span:
+                continue
+            date_text = date_span.get_text(strip=True)
+            date_match = re.match(r'(\d{4}-\d{2}-\d{2})\s+(\d{2}:\d{2})\s*　(.+)', date_text)
+            if not date_match:
+                continue
+            date_str = date_match.group(1)
+            time_str = date_match.group(2)
+            datetime_str = f"{date_str} {time_str}"
+            try:
+                post_date = datetime.strptime(datetime_str, "%Y-%m-%d %H:%M")
+                timestamp = int(post_date.timestamp())
+            except ValueError:
+                continue
+            headline_span = info_p.find('span', class_='info_list_txt')
+            headline = headline_span.get_text(strip=True) if headline_span else ""
+            headline = re.sub(r'<br\s*/?>', ' ', headline)
+            headline = re.sub(r'\s+', ' ', headline).strip()
+            images = []
+            img_div = link.find('div', class_='image')
+            if img_div:
+                img_tag = img_div.find('img')
+                if img_tag:
+                    img_src = img_tag.get('src', '')
+                    if img_src.startswith('/'):
+                        img_src = urljoin('https://sf6ta.jp', img_src)
+                    if img_processed <= IMAGE_LIMIT:
+                        try:
+                            img_b64 = _convert_image_to_base64(img_src)
+                            images.append({
+                                'image': img_b64,
+                                'link': url
+                            })
+                        except Exception:
+                            pass  # Failed likely due to 403. Just show no images in that case
+                        img_processed += 1
+            news_entry = {
+                'date': post_date.strftime("%Y-%m-%d %H:%M"),
+                'identifier': identifier,
+                'type': None,
+                'timestamp': timestamp,
+                'headline': None,
+                'content': headline,  # content should be prio-ed over headline
+                'url': url,
+                'images': images,
+                'is_ai_summary': False
+            }
+            news_entries.append(news_entry)
 
-        return news_entries
-
-    if parser == ParserVersion.ALPHA:
-        return alpha_parser
-    else:
-        raise ValueError("Unknown Parser Version")
-
+        except Exception:
+            continue
 
-parse_sf_news_site = make_sf_parser(
-    "STREET_FIGHTER", ParserVersion.ALPHA
-)
+    return news_entries
+\ No newline at end of file
author	Pinapelz <yukais@pinapelz.com>	2026-03-12 13:56:30 -0700
committer	Pinapelz <yukais@pinapelz.com>	2026-03-12 13:56:50 -0700
commit	caa3cf245186ab0f6fb33e63a7dd838d834da12e (patch)
tree	bc5742a134ecabf0b9d35cc12b1d6f67defd5da7 /taito
parent	5658441ab9b703c95a48e654d41e45cc3a55ffd3 (diff)