sega/idac.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103

import json
import logging
import re
from datetime import datetime, timedelta, timezone
from urllib.parse import urljoin

from bs4 import BeautifulSoup

from constants import IDAC_NEWS_SITE


_logger = logging.getLogger(__name__)

# Japan Standard Time is a fixed UTC+9 offset, so a plain fixed-offset tzinfo
# is sufficient and keeps the computed epoch independent of the host timezone.
_JST = timezone(timedelta(hours=9), "JST")

# Base used to resolve root-relative image paths found in the listing.
_IDAC_BASE_URL = 'https://info-initialdac.sega.jp'

# Compiled once at import time instead of on every article.
_POST_DATE_RE = re.compile(r'(\d{4})年(\d{1,2})月(\d{1,2})日')
_READ_MORE_RE = re.compile(r'続きを読む\s*.*$')
_TRAILING_ELLIPSIS_RE = re.compile(r'\s*…\s*$')


def _has_numeric_post_class(article) -> bool:
    """Return True if the article carries a ``post-<digits>`` CSS class."""
    return any(
        cls.startswith('post-') and cls[5:].isdigit()
        for cls in article.get('class', [])
    )


def _parse_post_date(date_text: str):
    """Parse a ``YYYY年M月D日`` prefix into a JST-noon datetime, or None."""
    match = _POST_DATE_RE.match(date_text)
    if not match:
        return None
    year, month, day = (int(part) for part in match.groups())
    try:
        # The page only gives a calendar date; noon JST is used as the time.
        return datetime(year, month, day, 12, 0, tzinfo=_JST)
    except ValueError:
        # Digits matched the pattern but do not form a real date (e.g. 2月30日).
        return None


def _extract_category(title_section):
    """Return the text of the first link in ``ul.post-categories``, or None."""
    categories_list = title_section.find('ul', class_='post-categories')
    if categories_list is None:
        return None
    category_link = categories_list.find('a')
    if category_link is None:
        return None
    return category_link.get_text(strip=True)


def _extract_summary(article) -> str:
    """Return the ``div.entry-summary`` text without the read-more tail."""
    entry_summary = article.find('div', class_='entry-summary')
    if entry_summary is None:
        return ""
    content = entry_summary.get_text(strip=True)
    # Drop the "続きを読む" (read more) link text and whatever follows it,
    # then any trailing ellipsis left by the excerpt truncation.
    content = _READ_MORE_RE.sub('', content).strip()
    return _TRAILING_ELLIPSIS_RE.sub('', content).strip()


def _extract_images(article, link: str) -> list:
    """Collect non-SVG ``<img>`` sources of the article, each tied to *link*."""
    images = []
    for img in article.find_all('img'):
        img_src = img.get('src', '')
        if not img_src or img_src.endswith('.svg'):  # Skip icon/UI images
            continue
        if img_src.startswith('/'):
            img_src = urljoin(_IDAC_BASE_URL, img_src)
        images.append({
            'image': img_src,
            'link': link,
        })
    return images


def _parse_article(article):
    """Build one news-entry dict from an ``<article>`` tag, or None to skip it."""
    if not _has_numeric_post_class(article):
        return None

    title_section = article.find('h1', class_='entry-title')
    if title_section is None:
        return None

    news_title_link = title_section.find('a', class_='news-title')
    if news_title_link is None:
        return None

    date_span = title_section.find('span', class_='entry_date')
    if date_span is None:
        return None

    post_date = _parse_post_date(date_span.get_text(strip=True))
    if post_date is None:
        return None

    url = news_title_link.get('href', '')
    headline = news_title_link.get_text(strip=True)
    content = _extract_summary(article)

    return {
        'date': post_date.strftime("%Y-%m-%d %H:%M"),
        'identifier': "IDAC_NEWS",
        'type': _extract_category(title_section),
        'timestamp': int(post_date.timestamp()),
        'headline': headline,
        # Fall back to the headline when the listing has no usable summary.
        'content': content if content else headline,
        'url': url,
        'images': _extract_images(article, url),
        'is_ai_summary': False,
    }


def parse_idac_news_site(site_data: str) -> list:
    """Parse the IDAC news listing HTML into a list of news-entry dicts.

    Every ``<article>`` whose class list contains a ``post-<digits>`` class is
    considered. An article is skipped when it lacks the ``h1.entry-title``
    heading, the ``a.news-title`` link, the ``span.entry_date`` element, or a
    date of the form ``YYYY年M月D日`` that is a valid calendar date.

    Args:
        site_data: Raw HTML of the news listing page.

    Returns:
        One dict per accepted article, in document order, with the keys
        ``date`` (``"%Y-%m-%d %H:%M"``, always 12:00), ``identifier``
        (``"IDAC_NEWS"``), ``type`` (first category text or ``None``),
        ``timestamp`` (epoch seconds of noon JST on the post date),
        ``headline``, ``content`` (summary, or the headline if the summary is
        empty), ``url``, ``images`` (list of ``{'image', 'link'}`` dicts) and
        ``is_ai_summary`` (``False``).
    """
    soup = BeautifulSoup(site_data, "html.parser")
    news_entries = []
    for article in soup.find_all('article', class_=lambda x: x and 'post-' in x):
        try:
            entry = _parse_article(article)
        except Exception:
            # Best-effort scraping: one malformed article must not abort the
            # whole page, but the failure is recorded instead of vanishing.
            _logger.warning("Skipping malformed IDAC news article", exc_info=True)
            continue
        if entry is not None:
            news_entries.append(entry)
    return news_entries


def get_promo_image(site_data: str) -> str:
    """Return the ``src`` of the first image inside ``div.entry-content``.

    Args:
        site_data: Raw HTML of the page to inspect.

    Returns:
        The ``src`` attribute of the first ``<img>`` found within the first
        ``div`` with class ``entry-content``. An empty string is returned when
        that ``div`` is missing, when it contains no ``<img>``, or when the
        ``<img>`` has no ``src`` attribute.
    """
    document = BeautifulSoup(site_data, "html.parser")

    container = document.find('div', class_='entry-content')
    if not container:
        return ''

    first_image = container.find('img')
    if not first_image:
        return ''

    return first_image.get('src', '')