diff options
| -rw-r--r-- | README.md | 1 | ||||
| -rw-r--r-- | constants.py | 4 | ||||
| -rw-r--r-- | generate.py | 5 | ||||
| -rw-r--r-- | news_feed.py | 8 | ||||
| -rw-r--r-- | taito/street_fighter.py | 82 |
5 files changed, 100 insertions, 0 deletions
@@ -25,4 +25,5 @@ Currently Supported: - maimai DX (INTL) (`maimaidx_intl_news`) - O.N.G.E.K.I (JPN) (`ongeki_jp_news`) - MUSIC DIVER (`music_diver_news`) +- STREET FIGHTER TYPE ARCADE (`street_fighter_news`) - Taiko no Tatsujin (`taiko_news`) -> Only official blog title and headings diff --git a/constants.py b/constants.py index e6b7e57..e1b0369 100644 --- a/constants.py +++ b/constants.py @@ -25,6 +25,7 @@ MAIMAIDX_INTL_NEWS_SITE="https://maimai.sega.com/download/" ONGEKI_JP_NEWS_SITE="https://info-ongeki.sega.jp/" MUSIC_DIVER_NEWS="https://mypage.musicdiver.jp/api/news?lang=en" +STREET_FIGHTER_NEWS_SITE="https://sf6ta.jp/info/list" TAIKO_BLOG_SITE="https://taiko-ch.net/blog/" WANGAN_MAXI_GENERIC="https://wanganmaxi-official.com/" @@ -58,3 +59,6 @@ class WANGAN_MAXI_VERSION(Enum): SIX_R = 3, SIX_RR = 4, SIX_RR_PLUS = 5 + +class STREET_FIGHTER_VERSION(Enum): + SIX = 1, diff --git a/generate.py b/generate.py index 2cc27d1..e94384e 100644 --- a/generate.py +++ b/generate.py @@ -125,6 +125,9 @@ def generate_chunithm_intl_news_file(): def generate_music_diver_news_file(): return generate_news_file("music_diver_news", constants.MUSIC_DIVER_NEWS) +def generate_street_fighter_news_file(): + return generate_news_file("street_fighter_news", constants.STREET_FIGHTER_NEWS_SITE, constants.STREET_FIGHTER_VERSION.SIX) + def generate_taiko_news_file(): return generate_news_file("taiko_news", constants.TAIKO_BLOG_SITE) @@ -161,6 +164,7 @@ if __name__ == "__main__": maimaidx_intl_news_data = generate_maimaidx_intl_news_file() chunithm_intl_news_data = generate_chunithm_intl_news_file() music_diver_news_data = generate_music_diver_news_file() + street_fighter_news_data = generate_street_fighter_news_file() taiko_news_data = generate_taiko_news_file() wacca_plus_news = generate_wacca_plus_news_file() museca_plus_news = generate_museca_plus_news_file() @@ -183,6 +187,7 @@ if __name__ == "__main__": maimaidx_intl_news_data, chunithm_intl_news_data, music_diver_news_data, + street_fighter_news_data, taiko_news_data, wacca_plus_news, museca_plus_news, diff --git a/news_feed.py b/news_feed.py index fb11c6c..7c781db 100644 --- a/news_feed.py +++ b/news_feed.py @@ -31,6 +31,7 @@ import sega.maimaidx_jp as maimaidx_jp import sega.maimaidx_intl as maimaidx_intl import sega.ongeki_jp as ongeki_jp import taito.music_diver as music_diver +import taito.street_fighter as street_fighter import bandai_namco.taiko as taiko import bandai_namco.wmmt as wmmt import community.disc as disc @@ -162,6 +163,13 @@ def get_news(news_url: str, version=None) -> list: api_data = download_site_as_html(news_url) news_posts = sorted(music_diver.parse_music_diver_news_json(api_data), key=lambda x: x['timestamp'], reverse=True) + elif news_url == constants.STREET_FIGHTER_NEWS_SITE: + site_data = download_site_as_html(news_url) + news_posts = sorted(street_fighter.parse_sf_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) + print(news_posts) + exit() + + elif news_url == constants.TAIKO_BLOG_SITE: site_data = download_site_as_html(news_url) news_posts = sorted(taiko.parse_taiko_blog_site(site_data), key=lambda x: x['timestamp'], reverse=True) diff --git a/taito/street_fighter.py b/taito/street_fighter.py new file mode 100644 index 0000000..1da80bd --- /dev/null +++ b/taito/street_fighter.py @@ -0,0 +1,82 @@ +import json +from bs4 import BeautifulSoup +import re +from datetime import datetime +from urllib.parse import urljoin +from enum import Enum +from constants import STREET_FIGHTER_NEWS_SITE + +class ParserVersion(Enum): + ALPHA = 1 + +def make_sf_parser(identifier: str, parser: ParserVersion): + def alpha_parser(html: str): + soup = BeautifulSoup(html, "html.parser") + news_entries = [] + news_links = soup.find_all('a', class_='btn_latestnews') + for link in news_links: + try: + url = link.get('href', '') + if url.startswith('/'): + url = urljoin(STREET_FIGHTER_NEWS_SITE, url) + info_p = link.find('p', class_='info_list_event') + if not info_p: + continue + date_span = info_p.find('span', class_='latestnews_date') + if not date_span: + continue + date_text = date_span.get_text(strip=True) + date_match = re.match(r'(\d{4}-\d{2}-\d{2})\s+(\d{2}:\d{2})\s* (.+)', date_text) + if not date_match: + continue + date_str = date_match.group(1) + time_str = date_match.group(2) + datetime_str = f"{date_str} {time_str}" + try: + post_date = datetime.strptime(datetime_str, "%Y-%m-%d %H:%M") + timestamp = int(post_date.timestamp()) + except ValueError: + continue + headline_span = info_p.find('span', class_='info_list_txt') + headline = headline_span.get_text(strip=True) if headline_span else "" + headline = re.sub(r'<br\s*/?>', ' ', headline) + headline = re.sub(r'\s+', ' ', headline).strip() + images = [] + img_div = link.find('div', class_='image') + if img_div: + img_tag = img_div.find('img') + if img_tag: + img_src = img_tag.get('src', '') + if img_src.startswith('/'): + img_src = urljoin('https://sf6ta.jp', img_src) + images.append({ + 'image': img_src, + 'link': url + }) + news_entry = { + 'date': post_date.strftime("%Y-%m-%d %H:%M"), + 'identifier': identifier, + 'type': None, + 'timestamp': timestamp, + 'headline': None, + 'content': headline, # content should be prio-ed over headline + 'url': url, + 'images': images, + 'is_ai_summary': False + } + news_entries.append(news_entry) + + except Exception as e: + continue + + return news_entries + + if parser == ParserVersion.ALPHA: + return alpha_parser + else: + raise ValueError("Unknown Parser Version") + + +parse_sf_news_site = make_sf_parser( + "STREET_FIGHTER", ParserVersion.ALPHA +) |
