diff options
| author | Pinapelz <yukais@pinapelz.com> | 2026-03-12 13:56:30 -0700 |
|---|---|---|
| committer | Pinapelz <yukais@pinapelz.com> | 2026-03-12 13:56:50 -0700 |
| commit | caa3cf245186ab0f6fb33e63a7dd838d834da12e (patch) | |
| tree | bc5742a134ecabf0b9d35cc12b1d6f67defd5da7 /taito | |
| parent | 5658441ab9b703c95a48e654d41e45cc3a55ffd3 (diff) | |
refactor: move to common NewsSource interface
cleanup imports by defining initializer modules and decorator
remove legacy scrapers
remove single factory for sega games (sites don't change that much)
Diffstat (limited to 'taito')
| -rw-r--r-- | taito/__init__.py | 7 | ||||
| -rw-r--r-- | taito/music_diver.py | 3 | ||||
| -rw-r--r-- | taito/street_fighter.py | 142 |
3 files changed, 74 insertions, 78 deletions
diff --git a/taito/__init__.py b/taito/__init__.py new file mode 100644 index 0000000..bc55d25 --- /dev/null +++ b/taito/__init__.py @@ -0,0 +1,7 @@ +from taito.music_diver import parse_music_diver_news_json +from taito.street_fighter import parse_sf_news_site + +__all__ = [ + "parse_music_diver_news_json", + "parse_sf_news_site", +]
\ No newline at end of file diff --git a/taito/music_diver.py b/taito/music_diver.py index 5469ad5..efab0b0 100644 --- a/taito/music_diver.py +++ b/taito/music_diver.py @@ -52,6 +52,7 @@ def parse_music_diver_news_json(data_str: str): "headline": post["title"], "content": content, "url": None, - "images": images + "images": images, + "is_ai_summary": False }) return news_posts diff --git a/taito/street_fighter.py b/taito/street_fighter.py index 987b72b..bf58090 100644 --- a/taito/street_fighter.py +++ b/taito/street_fighter.py @@ -3,15 +3,12 @@ from bs4 import BeautifulSoup import re from datetime import datetime from urllib.parse import urljoin -from enum import Enum from constants import STREET_FIGHTER_NEWS_SITE import requests import base64 IMAGE_LIMIT = 10 # only allow 10 images to be processed as b64 is expensive to store -class ParserVersion(Enum): - ALPHA = 1 def _convert_image_to_base64(img_url: str): headers = { @@ -26,81 +23,72 @@ def _convert_image_to_base64(img_url: str): else: raise Exception(f"Failed to fetch image from URL: {img_url}, status code: {response.status_code}") -def make_sf_parser(identifier: str, parser: ParserVersion): - def alpha_parser(html: str): - soup = BeautifulSoup(html, "html.parser") - news_entries = [] - img_processed = 0 - news_links = soup.find_all('a', class_='btn_latestnews') - for link in news_links: - try: - url = link.get('href', '') - if url.startswith('/'): - url = urljoin(STREET_FIGHTER_NEWS_SITE, url) - info_p = link.find('p', class_='info_list_event') - if not info_p: - continue - date_span = info_p.find('span', class_='latestnews_date') - if not date_span: - continue - date_text = date_span.get_text(strip=True) - date_match = re.match(r'(\d{4}-\d{2}-\d{2})\s+(\d{2}:\d{2})\s* (.+)', date_text) - if not date_match: - continue - date_str = date_match.group(1) - time_str = date_match.group(2) - datetime_str = f"{date_str} {time_str}" - try: - post_date = datetime.strptime(datetime_str, "%Y-%m-%d %H:%M") - timestamp = 
int(post_date.timestamp()) - except ValueError: - continue - headline_span = info_p.find('span', class_='info_list_txt') - headline = headline_span.get_text(strip=True) if headline_span else "" - headline = re.sub(r'<br\s*/?>', ' ', headline) - headline = re.sub(r'\s+', ' ', headline).strip() - images = [] - img_div = link.find('div', class_='image') - if img_div: - img_tag = img_div.find('img') - if img_tag: - img_src = img_tag.get('src', '') - if img_src.startswith('/'): - img_src = urljoin('https://sf6ta.jp', img_src) - if img_processed <= IMAGE_LIMIT: - try: - img_b64 = _convert_image_to_base64(img_src) - images.append({ - 'image': img_b64, - 'link': url - }) - except Exception: - pass # Failed likely due to 403. Just show no images in that case - img_processed += 1 - news_entry = { - 'date': post_date.strftime("%Y-%m-%d %H:%M"), - 'identifier': identifier, - 'type': None, - 'timestamp': timestamp, - 'headline': None, - 'content': headline, # content should be prio-ed over headline - 'url': url, - 'images': images, - 'is_ai_summary': False - } - news_entries.append(news_entry) - except Exception as e: +def parse_sf_news_site(html: str): + identifier = "STREET_FIGHTER" + soup = BeautifulSoup(html, "html.parser") + news_entries = [] + img_processed = 0 + news_links = soup.find_all('a', class_='btn_latestnews') + for link in news_links: + try: + url = link.get('href', '') + if url.startswith('/'): + url = urljoin(STREET_FIGHTER_NEWS_SITE, url) + info_p = link.find('p', class_='info_list_event') + if not info_p: continue + date_span = info_p.find('span', class_='latestnews_date') + if not date_span: + continue + date_text = date_span.get_text(strip=True) + date_match = re.match(r'(\d{4}-\d{2}-\d{2})\s+(\d{2}:\d{2})\s* (.+)', date_text) + if not date_match: + continue + date_str = date_match.group(1) + time_str = date_match.group(2) + datetime_str = f"{date_str} {time_str}" + try: + post_date = datetime.strptime(datetime_str, "%Y-%m-%d %H:%M") + timestamp = 
int(post_date.timestamp()) + except ValueError: + continue + headline_span = info_p.find('span', class_='info_list_txt') + headline = headline_span.get_text(strip=True) if headline_span else "" + headline = re.sub(r'<br\s*/?>', ' ', headline) + headline = re.sub(r'\s+', ' ', headline).strip() + images = [] + img_div = link.find('div', class_='image') + if img_div: + img_tag = img_div.find('img') + if img_tag: + img_src = img_tag.get('src', '') + if img_src.startswith('/'): + img_src = urljoin('https://sf6ta.jp', img_src) + if img_processed <= IMAGE_LIMIT: + try: + img_b64 = _convert_image_to_base64(img_src) + images.append({ + 'image': img_b64, + 'link': url + }) + except Exception: + pass # Failed likely due to 403. Just show no images in that case + img_processed += 1 + news_entry = { + 'date': post_date.strftime("%Y-%m-%d %H:%M"), + 'identifier': identifier, + 'type': None, + 'timestamp': timestamp, + 'headline': None, + 'content': headline, # content should be prio-ed over headline + 'url': url, + 'images': images, + 'is_ai_summary': False + } + news_entries.append(news_entry) - return news_entries - - if parser == ParserVersion.ALPHA: - return alpha_parser - else: - raise ValueError("Unknown Parser Version") - + except Exception: + continue -parse_sf_news_site = make_sf_parser( - "STREET_FIGHTER", ParserVersion.ALPHA -) + return news_entries
\ No newline at end of file |
