1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
|
import json
from bs4 import BeautifulSoup
import re
from datetime import datetime
from urllib.parse import urljoin
from constants import IDAC_NEWS_SITE
def parse_idac_news_site(site_data: str):
    """Parse the IDAC news listing HTML into a list of news-entry dicts.

    Every ``<article>`` element carrying a ``post-...`` class is inspected.
    An article is silently skipped when any of the following is missing or
    invalid: a ``post-<digits>`` class, an ``h1.entry-title`` element, an
    ``a.news-title`` link inside it, a ``span.entry_date`` whose text starts
    with a ``YYYY年M月D日`` date, or a date that is a real calendar day.

    Args:
        site_data: Raw HTML of the news listing page.

    Returns:
        A list of dicts, in document order, each with the keys ``date``
        (``"%Y-%m-%d %H:%M"``, always 12:00), ``identifier``
        (``"IDAC_NEWS"``), ``type`` (text of the first category link, or
        ``None``), ``timestamp`` (Unix seconds for noon JST on the post
        date), ``headline``, ``content`` (summary text, falling back to the
        headline when empty), ``url``, ``images`` (list of
        ``{'image', 'link'}`` dicts) and ``is_ai_summary`` (``False``).
    """
    # Function-scope import keeps this block self-contained with respect to
    # the file's top-level import section.
    from datetime import timedelta, timezone

    # The site publishes dates in Japan Standard Time, a fixed UTC+9 offset.
    # Using an aware datetime makes the timestamp independent of the
    # timezone of the machine running the scraper.
    jst = timezone(timedelta(hours=9), "JST")

    # Compile the loop-invariant patterns once.
    date_pattern = re.compile(r'(\d{4})年(\d{1,2})月(\d{1,2})日')
    read_more_pattern = re.compile(r'続きを読む\s*.*$')
    trailing_ellipsis_pattern = re.compile(r'\s*…\s*$')

    soup = BeautifulSoup(site_data, "html.parser")
    news_entries = []

    articles = soup.find_all('article', class_=lambda x: x and 'post-' in x)
    for article in articles:
        try:
            # Only real posts carry a numeric "post-<id>" class; other
            # "post-..." classes (e.g. layout helpers) do not qualify.
            has_post_id = any(
                cls.startswith('post-') and cls[5:].isdigit()
                for cls in article.get('class', [])
            )
            if not has_post_id:
                continue

            title_section = article.find('h1', class_='entry-title')
            if not title_section:
                continue

            news_title_link = title_section.find('a', class_='news-title')
            if not news_title_link:
                continue
            url = news_title_link.get('href', '')
            headline = news_title_link.get_text(strip=True)

            date_span = title_section.find('span', class_='entry_date')
            if not date_span:
                continue
            date_match = date_pattern.match(date_span.get_text(strip=True))
            if not date_match:
                continue
            year, month, day = (int(part) for part in date_match.groups())

            # The page gives only a day, so pin the time to noon JST.
            try:
                post_date = datetime(year, month, day, 12, 0, tzinfo=jst)
                timestamp = int(post_date.timestamp())
            except ValueError:
                # Not a real calendar date (e.g. month 13).
                continue

            post_type = None
            categories_list = title_section.find('ul', class_='post-categories')
            if categories_list:
                category_link = categories_list.find('a')
                if category_link:
                    post_type = category_link.get_text(strip=True)

            content = ""
            entry_summary = article.find('div', class_='entry-summary')
            if entry_summary:
                content = entry_summary.get_text(strip=True)
                # Drop the "read more" link text and anything after it,
                # then a trailing ellipsis left by the excerpt.
                content = read_more_pattern.sub('', content).strip()
                content = trailing_ellipsis_pattern.sub('', content).strip()

            images = []
            for img in article.find_all('img'):
                img_src = img.get('src', '')
                if img_src and not img_src.endswith('.svg'):  # Skip icon/UI images
                    if img_src.startswith('/'):
                        # Resolve root-relative paths against the news host.
                        img_src = urljoin('https://info-initialdac.sega.jp', img_src)
                    images.append({
                        'image': img_src,
                        'link': url,
                    })

            news_entries.append({
                'date': post_date.strftime("%Y-%m-%d %H:%M"),
                'identifier': "IDAC_NEWS",
                'type': post_type,
                'timestamp': timestamp,
                'headline': headline,
                'content': content if content else headline,
                'url': url,
                'images': images,
                'is_ai_summary': False,
            })
        except Exception:
            # Best-effort scraping: one malformed article must not abort
            # parsing of the remaining entries.
            continue

    return news_entries
def get_promo_image(site_data: str) -> str:
    """Return the ``src`` of the first image inside ``div.entry-content``.

    Args:
        site_data: Raw HTML of the page to inspect.

    Returns:
        The ``src`` attribute of the first ``<img>`` found within the first
        ``<div class="entry-content">``. An empty string is returned when
        the container, the image, or the ``src`` attribute is absent.
    """
    document = BeautifulSoup(site_data, "html.parser")

    container = document.find('div', class_='entry-content')
    if not container:
        return ''

    first_image = container.find('img')
    if not first_image:
        return ''

    return first_image.get('src', '')
|