From bcdec514d4823031b39d5c726b371a5bfbefd240 Mon Sep 17 00:00:00 2001 From: Pinapelz Date: Thu, 16 Oct 2025 11:26:27 -0700 Subject: fix: change polaris chord to scrape skeleton html, add date validation --- bemani/polaris_chord.py | 79 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 27 deletions(-) diff --git a/bemani/polaris_chord.py b/bemani/polaris_chord.py index 3b13236..b4c96cd 100644 --- a/bemani/polaris_chord.py +++ b/bemani/polaris_chord.py @@ -11,54 +11,79 @@ CATEGORY_MAP = { "i_04": "OTHER" } - def parse_polaris_chord_news_site(html: str) -> list[dict]: - base_url = "https://p.eagate.573.jp/" + base_url = "https://eacache.s.konaminet.jp/game/polarischord/pc/" soup = BeautifulSoup(html, 'html.parser') news_list = [] - for li in soup.select('#info-news li.news'): - raw_date = li.find('li', class_='news_date').text.strip() - match = re.search(r'(\d{4}/\d{1,2}/\d{1,2})', raw_date) - if not match: + + for li in soup.select('li.news'): + raw_type = li.get('data-category') + post_type = CATEGORY_MAP.get(raw_type, "OTHER") + + # Attempt to extract date from data-date (format: 20251015-01) + raw_date = li.get('data-date', '') + # Extract first 8 digits: YYYYMMDD + if len(raw_date) < 8: + continue + date_part = raw_date[:8] + date_match = re.match(r'(\d{4})(\d{2})(\d{2})', date_part) + if not date_match: + continue + year, month, day = map(int, date_match.groups()) + if month < 1 or month > 12: + continue + if day < 1 or day > 31: continue - date_str = match.group(1) + if month in [4, 6, 9, 11] and day > 30: + continue + if month == 2 and day > 29: + continue + + date_str = f"{year}/{month:02}/{day:02}" try: - dt = datetime.strptime(date_str, '%Y/%m/%d') - except ValueError: + jst = pytz.timezone('Asia/Tokyo') + dt_jst = jst.localize(datetime(year, month, day)) + timestamp = int(dt_jst.timestamp()) + except (ValueError, OverflowError): + # Skip if datetime creation fails continue - jst = pytz.timezone('Asia/Tokyo') - dt_jst = jst.localize(dt) - timestamp = int(dt_jst.timestamp()) - raw_type = li.get('data-category') - post_type = CATEGORY_MAP.get(raw_type) + # Find the news-main ul inside the li + news_main = li.find('ul', class_='news-main') + if not news_main: + continue - headline = li.find('li', class_='news_title').text.strip() - detail = li.find('li', class_='news_detail') - content = detail.get_text().strip() + # Extract headline from news_title li + headline_li = news_main.find('li', class_='news_title') + headline_text = headline_li.get_text(strip=True) if headline_li else None - first_a = detail.find('a', href=True) - url = urljoin(base_url, first_a['href']) if first_a else None + # Extract content from news_detail li + detail_li = news_main.find('li', class_='news_detail') + content = detail_li.get_text(strip=True) if detail_li else None + # Find all images in the detail section images = [] - for img in detail.find_all('img'): - img_url = img.get('src') - link = None - if img.parent.name == 'a' and img.parent.has_attr('href'): - link = img.parent['href'] - images.append({'image': img_url, 'link': link}) + if detail_li: + for img in detail_li.find_all('img'): + # Check both src and data-src (for lazy loaded images) + img_url = img.get('src') or img.get('data-src') + if img_url and not img_url.startswith('http'): + img_url = urljoin(base_url, img_url) + if img_url: + images.append({'image': img_url, 'link': None}) entry = { 'date': date_str, 'identifier': "POLARIS_CHORD", 'type': post_type, 'timestamp': timestamp, - 'headline': headline, + 'headline': headline_text, 'content': content, - 'url': url, + 'url': None, 'images': images, 'is_ai_summary': False, } news_list.append(entry) + return news_list -- cgit v1.2.3