aboutsummaryrefslogtreecommitdiffstats
path: root/taito
diff options
context:
space:
mode:
authorPinapelz <yukais@pinapelz.com>2026-03-12 13:56:30 -0700
committerPinapelz <yukais@pinapelz.com>2026-03-12 13:56:50 -0700
commitcaa3cf245186ab0f6fb33e63a7dd838d834da12e (patch)
treebc5742a134ecabf0b9d35cc12b1d6f67defd5da7 /taito
parent5658441ab9b703c95a48e654d41e45cc3a55ffd3 (diff)
refactor: move to common NewsSource interface
Clean up imports by defining initializer modules and a decorator; remove legacy scrapers; remove the single factory for Sega games (sites don't change that much).
Diffstat (limited to 'taito')
-rw-r--r--taito/__init__.py7
-rw-r--r--taito/music_diver.py3
-rw-r--r--taito/street_fighter.py142
3 files changed, 74 insertions, 78 deletions
diff --git a/taito/__init__.py b/taito/__init__.py
new file mode 100644
index 0000000..bc55d25
--- /dev/null
+++ b/taito/__init__.py
@@ -0,0 +1,7 @@
+from taito.music_diver import parse_music_diver_news_json
+from taito.street_fighter import parse_sf_news_site
+
+__all__ = [
+ "parse_music_diver_news_json",
+ "parse_sf_news_site",
+] \ No newline at end of file
diff --git a/taito/music_diver.py b/taito/music_diver.py
index 5469ad5..efab0b0 100644
--- a/taito/music_diver.py
+++ b/taito/music_diver.py
@@ -52,6 +52,7 @@ def parse_music_diver_news_json(data_str: str):
"headline": post["title"],
"content": content,
"url": None,
- "images": images
+ "images": images,
+ "is_ai_summary": False
})
return news_posts
diff --git a/taito/street_fighter.py b/taito/street_fighter.py
index 987b72b..bf58090 100644
--- a/taito/street_fighter.py
+++ b/taito/street_fighter.py
@@ -3,15 +3,12 @@ from bs4 import BeautifulSoup
import re
from datetime import datetime
from urllib.parse import urljoin
-from enum import Enum
from constants import STREET_FIGHTER_NEWS_SITE
import requests
import base64
IMAGE_LIMIT = 10 # only allow 10 images to be processed as b64 is expensive to store
-class ParserVersion(Enum):
- ALPHA = 1
def _convert_image_to_base64(img_url: str):
headers = {
@@ -26,81 +23,72 @@ def _convert_image_to_base64(img_url: str):
else:
raise Exception(f"Failed to fetch image from URL: {img_url}, status code: {response.status_code}")
-def make_sf_parser(identifier: str, parser: ParserVersion):
- def alpha_parser(html: str):
- soup = BeautifulSoup(html, "html.parser")
- news_entries = []
- img_processed = 0
- news_links = soup.find_all('a', class_='btn_latestnews')
- for link in news_links:
- try:
- url = link.get('href', '')
- if url.startswith('/'):
- url = urljoin(STREET_FIGHTER_NEWS_SITE, url)
- info_p = link.find('p', class_='info_list_event')
- if not info_p:
- continue
- date_span = info_p.find('span', class_='latestnews_date')
- if not date_span:
- continue
- date_text = date_span.get_text(strip=True)
- date_match = re.match(r'(\d{4}-\d{2}-\d{2})\s+(\d{2}:\d{2})\s* (.+)', date_text)
- if not date_match:
- continue
- date_str = date_match.group(1)
- time_str = date_match.group(2)
- datetime_str = f"{date_str} {time_str}"
- try:
- post_date = datetime.strptime(datetime_str, "%Y-%m-%d %H:%M")
- timestamp = int(post_date.timestamp())
- except ValueError:
- continue
- headline_span = info_p.find('span', class_='info_list_txt')
- headline = headline_span.get_text(strip=True) if headline_span else ""
- headline = re.sub(r'<br\s*/?>', ' ', headline)
- headline = re.sub(r'\s+', ' ', headline).strip()
- images = []
- img_div = link.find('div', class_='image')
- if img_div:
- img_tag = img_div.find('img')
- if img_tag:
- img_src = img_tag.get('src', '')
- if img_src.startswith('/'):
- img_src = urljoin('https://sf6ta.jp', img_src)
- if img_processed <= IMAGE_LIMIT:
- try:
- img_b64 = _convert_image_to_base64(img_src)
- images.append({
- 'image': img_b64,
- 'link': url
- })
- except Exception:
- pass # Failed likely due to 403. Just show no images in that case
- img_processed += 1
- news_entry = {
- 'date': post_date.strftime("%Y-%m-%d %H:%M"),
- 'identifier': identifier,
- 'type': None,
- 'timestamp': timestamp,
- 'headline': None,
- 'content': headline, # content should be prio-ed over headline
- 'url': url,
- 'images': images,
- 'is_ai_summary': False
- }
- news_entries.append(news_entry)
- except Exception as e:
+def parse_sf_news_site(html: str):
+ identifier = "STREET_FIGHTER"
+ soup = BeautifulSoup(html, "html.parser")
+ news_entries = []
+ img_processed = 0
+ news_links = soup.find_all('a', class_='btn_latestnews')
+ for link in news_links:
+ try:
+ url = link.get('href', '')
+ if url.startswith('/'):
+ url = urljoin(STREET_FIGHTER_NEWS_SITE, url)
+ info_p = link.find('p', class_='info_list_event')
+ if not info_p:
continue
+ date_span = info_p.find('span', class_='latestnews_date')
+ if not date_span:
+ continue
+ date_text = date_span.get_text(strip=True)
+ date_match = re.match(r'(\d{4}-\d{2}-\d{2})\s+(\d{2}:\d{2})\s* (.+)', date_text)
+ if not date_match:
+ continue
+ date_str = date_match.group(1)
+ time_str = date_match.group(2)
+ datetime_str = f"{date_str} {time_str}"
+ try:
+ post_date = datetime.strptime(datetime_str, "%Y-%m-%d %H:%M")
+ timestamp = int(post_date.timestamp())
+ except ValueError:
+ continue
+ headline_span = info_p.find('span', class_='info_list_txt')
+ headline = headline_span.get_text(strip=True) if headline_span else ""
+ headline = re.sub(r'<br\s*/?>', ' ', headline)
+ headline = re.sub(r'\s+', ' ', headline).strip()
+ images = []
+ img_div = link.find('div', class_='image')
+ if img_div:
+ img_tag = img_div.find('img')
+ if img_tag:
+ img_src = img_tag.get('src', '')
+ if img_src.startswith('/'):
+ img_src = urljoin('https://sf6ta.jp', img_src)
+ if img_processed <= IMAGE_LIMIT:
+ try:
+ img_b64 = _convert_image_to_base64(img_src)
+ images.append({
+ 'image': img_b64,
+ 'link': url
+ })
+ except Exception:
+ pass # Failed likely due to 403. Just show no images in that case
+ img_processed += 1
+ news_entry = {
+ 'date': post_date.strftime("%Y-%m-%d %H:%M"),
+ 'identifier': identifier,
+ 'type': None,
+ 'timestamp': timestamp,
+ 'headline': None,
+ 'content': headline, # content should be prio-ed over headline
+ 'url': url,
+ 'images': images,
+ 'is_ai_summary': False
+ }
+ news_entries.append(news_entry)
- return news_entries
-
- if parser == ParserVersion.ALPHA:
- return alpha_parser
- else:
- raise ValueError("Unknown Parser Version")
-
+ except Exception:
+ continue
-parse_sf_news_site = make_sf_parser(
- "STREET_FIGHTER", ParserVersion.ALPHA
-)
+ return news_entries \ No newline at end of file
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage