1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
|
import json
from bs4 import BeautifulSoup
import re
from datetime import datetime
from urllib.parse import urljoin
from constants import STREET_FIGHTER_NEWS_SITE
import requests
import base64
# Cap on how many images are base64-encoded per parse: inlined data URIs are
# expensive to store, so the remaining entries are emitted without images.
IMAGE_LIMIT = 10
def _convert_image_to_base64(img_url: str, timeout: float = 10.0) -> str:
    """Download an image and return it inlined as a ``data:`` URI.

    Args:
        img_url: Absolute URL of the image to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled image
            host could hang the whole scrape.

    Returns:
        A string of the form ``data:<mime>;base64,<payload>``, where the MIME
        type is taken verbatim from the response's ``Content-Type`` header.

    Raises:
        Exception: If the server answers with a status code other than 200.
        KeyError: If the response carries no ``Content-Type`` header.
        requests.RequestException: On connection errors or when ``timeout``
            expires.
    """
    # Browser-like User-Agent; presumably the image host rejects the default
    # python-requests agent (the caller mentions 403s) -- TODO confirm.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(img_url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch image from URL: {img_url}, status code: {response.status_code}")

    img_base64 = base64.b64encode(response.content).decode('utf-8')
    mime_type = response.headers['Content-Type']
    return f"data:{mime_type};base64,{img_base64}"
def parse_sf_news_site(html: str):
    """Extract news entries from the Street Fighter news page HTML.

    Every ``<a class="btn_latestnews">`` element is treated as one news item.
    An item is skipped when it lacks the ``info_list_event`` paragraph, the
    ``latestnews_date`` span, or a date of the form ``YYYY-MM-DD HH:MM``
    followed by further text; any unexpected error while handling one item
    also just skips that item (best-effort scraping).

    Args:
        html: Raw HTML of the news listing page.

    Returns:
        A list of dicts, one per parsed item, with the keys ``date``,
        ``identifier``, ``type``, ``timestamp``, ``headline``, ``content``,
        ``url``, ``images`` and ``is_ai_summary``. ``images`` holds at most
        one ``{'image': <data URI>, 'link': <item url>}`` dict and is empty
        when the item has no image, the download failed, or the per-call
        image budget (``IMAGE_LIMIT``) is exhausted.
    """
    identifier = "STREET_FIGHTER"
    soup = BeautifulSoup(html, "html.parser")
    # Compiled once instead of being re-resolved for every link.
    date_pattern = re.compile(r'(\d{4}-\d{2}-\d{2})\s+(\d{2}:\d{2})\s* (.+)')

    news_entries = []
    img_processed = 0  # download attempts so far, successful or not

    for link in soup.find_all('a', class_='btn_latestnews'):
        try:
            url = link.get('href', '')
            if url.startswith('/'):
                url = urljoin(STREET_FIGHTER_NEWS_SITE, url)

            info_p = link.find('p', class_='info_list_event')
            if not info_p:
                continue

            date_span = info_p.find('span', class_='latestnews_date')
            if not date_span:
                continue

            date_match = date_pattern.match(date_span.get_text(strip=True))
            if not date_match:
                continue

            datetime_str = f"{date_match.group(1)} {date_match.group(2)}"
            try:
                post_date = datetime.strptime(datetime_str, "%Y-%m-%d %H:%M")
                # post_date is naive, so timestamp() interprets it in the
                # local timezone of the running process.
                # NOTE(review): if the site publishes times in a fixed zone
                # (e.g. JST), the epoch value shifts with the host TZ -- confirm.
                timestamp = int(post_date.timestamp())
            except ValueError:
                continue

            headline_span = info_p.find('span', class_='info_list_txt')
            headline = headline_span.get_text(strip=True) if headline_span else ""
            # Drop any literal "<br>" text that survived and collapse whitespace.
            headline = re.sub(r'<br\s*/?>', ' ', headline)
            headline = re.sub(r'\s+', ' ', headline).strip()

            images = []
            img_div = link.find('div', class_='image')
            img_tag = img_div.find('img') if img_div else None
            if img_tag:
                img_src = img_tag.get('src', '')
                if img_src.startswith('/'):
                    # NOTE(review): host is hard-coded here while article
                    # links use STREET_FIGHTER_NEWS_SITE -- confirm intended.
                    img_src = urljoin('https://sf6ta.jp', img_src)
                # Strict '<' so that at most IMAGE_LIMIT downloads happen
                # (the previous '<=' allowed IMAGE_LIMIT + 1). An empty src
                # can never be fetched, so it must not consume the budget.
                if img_src and img_processed < IMAGE_LIMIT:
                    try:
                        img_b64 = _convert_image_to_base64(img_src)
                        images.append({
                            'image': img_b64,
                            'link': url
                        })
                    except Exception:
                        pass  # Failed likely due to 403. Just show no images in that case
                    img_processed += 1

            news_entries.append({
                'date': post_date.strftime("%Y-%m-%d %H:%M"),
                'identifier': identifier,
                'type': None,
                'timestamp': timestamp,
                'headline': None,
                'content': headline,  # content should be prio-ed over headline
                'url': url,
                'images': images,
                'is_ai_summary': False
            })
        except Exception:
            # Best-effort: one malformed item must not abort the whole parse.
            continue

    return news_entries
|