# taito/street_fighter.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94

import json
from bs4 import BeautifulSoup
import re
from datetime import datetime
from urllib.parse import urljoin
from constants import STREET_FIGHTER_NEWS_SITE
import requests
import base64

# Intended cap on how many images a single parse converts to base64;
# base64 payloads are expensive to store.
IMAGE_LIMIT = 10


def _convert_image_to_base64(img_url: str, timeout: float = 10.0) -> str:
    """Download an image and return it as a ``data:`` URI with a base64 payload.

    Args:
        img_url: Absolute URL of the image to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled host cannot block the caller indefinitely.

    Returns:
        A string of the form ``data:<mime type>;base64,<payload>``.

    Raises:
        Exception: If the server answers with a status code other than 200.
        requests.RequestException: Propagated from ``requests.get`` on
            connection problems or timeouts.
    """
    # A browser-like User-Agent is sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(img_url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch image from URL: {img_url}, status code: {response.status_code}")

    img_base64 = base64.b64encode(response.content).decode('utf-8')
    # The Content-Type header is optional; fall back to a generic binary type
    # instead of failing with KeyError when the server omits it.
    mime_type = response.headers.get('Content-Type') or 'application/octet-stream'
    return f"data:{mime_type};base64,{img_base64}"


def parse_sf_news_site(html: str):
    """Extract news entries from the Street Fighter news page HTML.

    Every ``<a class="btn_latestnews">`` element is treated as one news item.
    Items without an ``info_list_event`` paragraph, without a
    ``latestnews_date`` span, or whose date text cannot be parsed are skipped.
    Any unexpected error while handling one item skips that item only.

    Args:
        html: Raw HTML of the news page.

    Returns:
        A list of dicts with the keys ``date`` ("%Y-%m-%d %H:%M"),
        ``identifier``, ``type``, ``timestamp`` (int seconds), ``headline``
        (always ``None``), ``content``, ``url``, ``images`` (list of
        ``{'image': <data URI>, 'link': <url>}``) and ``is_ai_summary``.
    """
    identifier = "STREET_FIGHTER"
    soup = BeautifulSoup(html, "html.parser")
    # "YYYY-MM-DD HH:MM", then a full-width space (U+3000) followed by more
    # text. The invisible character is spelled as an escape so it cannot be
    # lost or mistaken for a normal space during editing.
    date_pattern = re.compile(r'(\d{4}-\d{2}-\d{2})\s+(\d{2}:\d{2})\s*\u3000(.+)')
    news_entries = []
    img_processed = 0  # image conversions attempted so far (successful or not)

    for link in soup.find_all('a', class_='btn_latestnews'):
        try:
            url = link.get('href', '')
            if url.startswith('/'):
                url = urljoin(STREET_FIGHTER_NEWS_SITE, url)

            info_p = link.find('p', class_='info_list_event')
            if info_p is None:
                continue
            date_span = info_p.find('span', class_='latestnews_date')
            if date_span is None:
                continue
            date_match = date_pattern.match(date_span.get_text(strip=True))
            if date_match is None:
                continue

            datetime_str = f"{date_match.group(1)} {date_match.group(2)}"
            try:
                # The regex only checks the digit layout; strptime rejects
                # impossible values such as month 13.
                post_date = datetime.strptime(datetime_str, "%Y-%m-%d %H:%M")
                # NOTE(review): post_date is naive, so timestamp() interprets
                # it in the local timezone of the machine running this code.
                timestamp = int(post_date.timestamp())
            except ValueError:
                continue

            headline_span = info_p.find('span', class_='info_list_txt')
            headline = headline_span.get_text(strip=True) if headline_span else ""
            # Replace any literal "<br>" text left in the extracted string and
            # collapse runs of whitespace into single spaces.
            headline = re.sub(r'<br\s*/?>', ' ', headline)
            headline = re.sub(r'\s+', ' ', headline).strip()

            images = []
            img_div = link.find('div', class_='image')
            img_tag = img_div.find('img') if img_div else None
            if img_tag is not None:
                img_src = img_tag.get('src', '')
                if img_src.startswith('/'):
                    # NOTE(review): base URL is hard-coded here while links
                    # use STREET_FIGHTER_NEWS_SITE - confirm whether they
                    # should be the same.
                    img_src = urljoin('https://sf6ta.jp', img_src)
                # Strictly below the limit: the counter starts at 0, so "<="
                # would allow IMAGE_LIMIT + 1 conversions. An empty src is
                # skipped without consuming one of the limited slots.
                if img_src and img_processed < IMAGE_LIMIT:
                    try:
                        images.append({
                            'image': _convert_image_to_base64(img_src),
                            'link': url
                        })
                    except Exception:
                        pass  # Failed likely due to 403. Just show no images in that case
                    img_processed += 1

            news_entries.append({
                'date': post_date.strftime("%Y-%m-%d %H:%M"),
                'identifier': identifier,
                'type': None,
                'timestamp': timestamp,
                'headline': None,
                'content': headline,  # content should be prio-ed over headline
                'url': url,
                'images': images,
                'is_ai_summary': False
            })

        except Exception:
            # Best effort: one malformed item must not abort the whole parse.
            continue

    return news_entries