1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
|
from bs4 import BeautifulSoup
from datetime import datetime
import pytz
from urllib.parse import urljoin
import re
# Translates the raw ``data-category`` code found on a news ``<li>`` into the
# post type label stored in each parsed entry. Codes missing from this table
# are reported as "OTHER" by the parser.
CATEGORY_MAP = {
    "i_01": "NEWS",
    "i_02": "MUSIC",
    "i_03": "EVENT",
    "i_04": "OTHER",
}
def _extract_image_entries(detail_li, base_url: str) -> list[dict]:
    """Return one ``{'image': url, 'link': None}`` dict per usable ``<img>`` in *detail_li*."""
    entries = []
    for img in detail_li.find_all('img'):
        # ``src`` wins when present; ``data-src`` is the fallback used by
        # lazy-loaded images.
        img_url = img.get('src') or img.get('data-src')
        if not img_url:
            continue
        # Only genuine http(s) URLs are left untouched. A bare 'http' prefix
        # test would also match relative paths such as "http_banner.png".
        if not img_url.startswith(('http://', 'https://')):
            img_url = urljoin(base_url, img_url)
        entries.append({'image': img_url, 'link': None})
    return entries


def parse_polaris_chord_news_site(html: str, limit: int) -> list[dict]:
    """Parse the Polaris Chord news page markup into a list of news entries.

    Every ``<li class="news">`` element yields at most one entry. An item is
    skipped when its ``data-date`` attribute does not begin with eight digits
    forming a valid ``YYYYMMDD`` calendar date, or when it contains no
    ``<ul class="news-main">`` element.

    Args:
        html: Markup of the news page.
        limit: Maximum number of entries to return. Parsing stops as soon as
            this many entries have been collected; a value of zero or less
            yields an empty list.

    Returns:
        Entries in document order. Each entry is a dict with the keys:
        ``date`` (``year/MM/DD`` string), ``identifier`` (always
        ``"POLARIS_CHORD"``), ``type`` (looked up in ``CATEGORY_MAP``,
        ``"OTHER"`` for unknown codes), ``timestamp`` (Unix seconds for
        midnight of that date in Asia/Tokyo), ``headline`` and ``content``
        (stripped text, or ``None`` when the element is missing), ``url``
        (always ``None``), ``images`` (list of ``{'image', 'link'}`` dicts)
        and ``is_ai_summary`` (always ``False``).
    """
    base_url = "https://eacache.s.konaminet.jp/game/polarischord/pc/"
    soup = BeautifulSoup(html, 'html.parser')

    # Loop invariants: build the timezone and the date pattern only once.
    jst = pytz.timezone('Asia/Tokyo')
    date_prefix = re.compile(r'(\d{4})(\d{2})(\d{2})')

    news_list = []
    for li in soup.select('li.news'):
        if len(news_list) >= limit:
            break

        post_type = CATEGORY_MAP.get(li.get('data-category'), "OTHER")

        # data-date looks like "20251015-01"; only the leading YYYYMMDD matters.
        date_match = date_prefix.match(li.get('data-date', ''))
        if not date_match:
            continue
        year, month, day = map(int, date_match.groups())
        try:
            # datetime() itself rejects impossible calendar dates (month 13,
            # 31 April, 29 February in a non-leap year, year 0, ...), so no
            # separate range checks are needed.
            dt_jst = jst.localize(datetime(year, month, day))
            timestamp = int(dt_jst.timestamp())
        except (ValueError, OverflowError):
            continue
        date_str = f"{year}/{month:02}/{day:02}"

        news_main = li.find('ul', class_='news-main')
        if not news_main:
            continue

        headline_li = news_main.find('li', class_='news_title')
        headline_text = headline_li.get_text(strip=True) if headline_li else None

        detail_li = news_main.find('li', class_='news_detail')
        content = detail_li.get_text(strip=True) if detail_li else None
        images = _extract_image_entries(detail_li, base_url) if detail_li else []

        news_list.append({
            'date': date_str,
            'identifier': "POLARIS_CHORD",
            'type': post_type,
            'timestamp': timestamp,
            'headline': headline_text,
            'content': content,
            'url': None,
            'images': images,
            'is_ai_summary': False,
        })
    return news_list
|