about summary refs log tree commit diff stats
path: root/taito
diff options
context:
space:
mode:
Diffstat (limited to 'taito')
-rw-r--r-- taito/__init__.py 7
-rw-r--r-- taito/music_diver.py 3
-rw-r--r-- taito/street_fighter.py 142
3 files changed, 74 insertions, 78 deletions
diff --git a/taito/__init__.py b/taito/__init__.py
new file mode 100644
index 0000000..bc55d25
--- /dev/null
+++ b/taito/__init__.py
@@ -0,0 +1,7 @@
+from taito.music_diver import parse_music_diver_news_json
+from taito.street_fighter import parse_sf_news_site
+
+__all__ = [
+ "parse_music_diver_news_json",
+ "parse_sf_news_site",
+]
\ No newline at end of file
diff --git a/taito/music_diver.py b/taito/music_diver.py
index 5469ad5..efab0b0 100644
--- a/taito/music_diver.py
+++ b/taito/music_diver.py
@@ -52,6 +52,7 @@ def parse_music_diver_news_json(data_str: str):
"headline": post["title"],
"content": content,
"url": None,
- "images": images
+ "images": images,
+ "is_ai_summary": False
})
return news_posts
diff --git a/taito/street_fighter.py b/taito/street_fighter.py
index 987b72b..bf58090 100644
--- a/taito/street_fighter.py
+++ b/taito/street_fighter.py
@@ -3,15 +3,12 @@ from bs4 import BeautifulSoup
import re
from datetime import datetime
from urllib.parse import urljoin
-from enum import Enum
from constants import STREET_FIGHTER_NEWS_SITE
import requests
import base64
IMAGE_LIMIT = 10 # only allow 10 images to be processed as b64 is expensive to store
-class ParserVersion(Enum):
- ALPHA = 1
def _convert_image_to_base64(img_url: str):
headers = {
@@ -26,81 +23,72 @@ def _convert_image_to_base64(img_url: str):
else:
raise Exception(f"Failed to fetch image from URL: {img_url}, status code: {response.status_code}")
-def make_sf_parser(identifier: str, parser: ParserVersion):
- def alpha_parser(html: str):
- soup = BeautifulSoup(html, "html.parser")
- news_entries = []
- img_processed = 0
- news_links = soup.find_all('a', class_='btn_latestnews')
- for link in news_links:
- try:
- url = link.get('href', '')
- if url.startswith('/'):
- url = urljoin(STREET_FIGHTER_NEWS_SITE, url)
- info_p = link.find('p', class_='info_list_event')
- if not info_p:
- continue
- date_span = info_p.find('span', class_='latestnews_date')
- if not date_span:
- continue
- date_text = date_span.get_text(strip=True)
- date_match = re.match(r'(\d{4}-\d{2}-\d{2})\s+(\d{2}:\d{2})\s* (.+)', date_text)
- if not date_match:
- continue
- date_str = date_match.group(1)
- time_str = date_match.group(2)
- datetime_str = f"{date_str} {time_str}"
- try:
- post_date = datetime.strptime(datetime_str, "%Y-%m-%d %H:%M")
- timestamp = int(post_date.timestamp())
- except ValueError:
- continue
- headline_span = info_p.find('span', class_='info_list_txt')
- headline = headline_span.get_text(strip=True) if headline_span else ""
- headline = re.sub(r'<br\s*/?>', ' ', headline)
- headline = re.sub(r'\s+', ' ', headline).strip()
- images = []
- img_div = link.find('div', class_='image')
- if img_div:
- img_tag = img_div.find('img')
- if img_tag:
- img_src = img_tag.get('src', '')
- if img_src.startswith('/'):
- img_src = urljoin('https://sf6ta.jp', img_src)
- if img_processed <= IMAGE_LIMIT:
- try:
- img_b64 = _convert_image_to_base64(img_src)
- images.append({
- 'image': img_b64,
- 'link': url
- })
- except Exception:
- pass # Failed likely due to 403. Just show no images in that case
- img_processed += 1
- news_entry = {
- 'date': post_date.strftime("%Y-%m-%d %H:%M"),
- 'identifier': identifier,
- 'type': None,
- 'timestamp': timestamp,
- 'headline': None,
- 'content': headline, # content should be prio-ed over headline
- 'url': url,
- 'images': images,
- 'is_ai_summary': False
- }
- news_entries.append(news_entry)
- except Exception as e:
+def parse_sf_news_site(html: str):
+ identifier = "STREET_FIGHTER"
+ soup = BeautifulSoup(html, "html.parser")
+ news_entries = []
+ img_processed = 0
+ news_links = soup.find_all('a', class_='btn_latestnews')
+ for link in news_links:
+ try:
+ url = link.get('href', '')
+ if url.startswith('/'):
+ url = urljoin(STREET_FIGHTER_NEWS_SITE, url)
+ info_p = link.find('p', class_='info_list_event')
+ if not info_p:
continue
+ date_span = info_p.find('span', class_='latestnews_date')
+ if not date_span:
+ continue
+ date_text = date_span.get_text(strip=True)
+ date_match = re.match(r'(\d{4}-\d{2}-\d{2})\s+(\d{2}:\d{2})\s* (.+)', date_text)
+ if not date_match:
+ continue
+ date_str = date_match.group(1)
+ time_str = date_match.group(2)
+ datetime_str = f"{date_str} {time_str}"
+ try:
+ post_date = datetime.strptime(datetime_str, "%Y-%m-%d %H:%M")
+ timestamp = int(post_date.timestamp())
+ except ValueError:
+ continue
+ headline_span = info_p.find('span', class_='info_list_txt')
+ headline = headline_span.get_text(strip=True) if headline_span else ""
+ headline = re.sub(r'<br\s*/?>', ' ', headline)
+ headline = re.sub(r'\s+', ' ', headline).strip()
+ images = []
+ img_div = link.find('div', class_='image')
+ if img_div:
+ img_tag = img_div.find('img')
+ if img_tag:
+ img_src = img_tag.get('src', '')
+ if img_src.startswith('/'):
+ img_src = urljoin('https://sf6ta.jp', img_src)
+ if img_processed <= IMAGE_LIMIT:
+ try:
+ img_b64 = _convert_image_to_base64(img_src)
+ images.append({
+ 'image': img_b64,
+ 'link': url
+ })
+ except Exception:
+ pass # Failed likely due to 403. Just show no images in that case
+ img_processed += 1
+ news_entry = {
+ 'date': post_date.strftime("%Y-%m-%d %H:%M"),
+ 'identifier': identifier,
+ 'type': None,
+ 'timestamp': timestamp,
+ 'headline': None,
+ 'content': headline, # content should be prio-ed over headline
+ 'url': url,
+ 'images': images,
+ 'is_ai_summary': False
+ }
+ news_entries.append(news_entry)
- return news_entries
-
- if parser == ParserVersion.ALPHA:
- return alpha_parser
- else:
- raise ValueError("Unknown Parser Version")
-
+ except Exception:
+ continue
-parse_sf_news_site = make_sf_parser(
- "STREET_FIGHTER", ParserVersion.ALPHA
-)
+ return news_entries
\ No newline at end of file
Send patches to the email address below:
yukais@pinapelz.com
Include [PATCH repo_name] in the subject line.
pinapelz.com
homepage