fix: change polaris chord to scrape skeleton html, add date validation

author: Pinapelz <yukais@pinapelz.com> 2025-10-16 11:26:27 -0700
committer: Pinapelz <yukais@pinapelz.com> 2025-10-16 11:26:27 -0700
commit: bcdec514d4823031b39d5c726b371a5bfbefd240 (patch)
tree: 40e1056bcb1d4572ee3caedb67db6080a0af4cb6 /bemani
parent: 9aeca20449d6b4083d53b510ef8ab819bde43846 (diff)
1 file changed, 52 insertions, 27 deletions
diff --git a/bemani/polaris_chord.py b/bemani/polaris_chord.py
index 3b13236..b4c96cd 100644
--- a/bemani/polaris_chord.py
+++ b/bemani/polaris_chord.py
@@ -11,54 +11,79 @@ CATEGORY_MAP = {
     "i_04": "OTHER"
 }
 
-
 def parse_polaris_chord_news_site(html: str) -> list[dict]:
-    base_url = "https://p.eagate.573.jp/"
+    base_url = "https://eacache.s.konaminet.jp/game/polarischord/pc/"
     soup = BeautifulSoup(html, 'html.parser')
     news_list = []
-    for li in soup.select('#info-news li.news'):
-        raw_date = li.find('li', class_='news_date').text.strip()
-        match = re.search(r'(\d{4}/\d{1,2}/\d{1,2})', raw_date)
-        if not match:
+
+    for li in soup.select('li.news'):
+        raw_type = li.get('data-category')
+        post_type = CATEGORY_MAP.get(raw_type, "OTHER")
+
+        # Attempt to extract date from data-date (format: 20251015-01)
+        raw_date = li.get('data-date', '')
+        # Extract first 8 digits: YYYYMMDD
+        if len(raw_date) < 8:
+            continue
+        date_part = raw_date[:8]
+        date_match = re.match(r'(\d{4})(\d{2})(\d{2})', date_part)
+        if not date_match:
+            continue
+        year, month, day = map(int, date_match.groups())
+        if month < 1 or month > 12:
+            continue
+        if day < 1 or day > 31:
             continue
-        date_str = match.group(1)
+        if month in [4, 6, 9, 11] and day > 30:
+            continue
+        if month == 2 and day > 29:
+            continue
+
+        date_str = f"{year}/{month:02}/{day:02}"
 
         try:
-            dt = datetime.strptime(date_str, '%Y/%m/%d')
-        except ValueError:
+            jst = pytz.timezone('Asia/Tokyo')
+            dt_jst = jst.localize(datetime(year, month, day))
+            timestamp = int(dt_jst.timestamp())
+        except (ValueError, OverflowError):
+            # Skip if datetime creation fails
             continue
-        jst = pytz.timezone('Asia/Tokyo')
-        dt_jst = jst.localize(dt)
-        timestamp = int(dt_jst.timestamp())
 
-        raw_type = li.get('data-category')
-        post_type = CATEGORY_MAP.get(raw_type)
+        # Find the news-main ul inside the li
+        news_main = li.find('ul', class_='news-main')
+        if not news_main:
+            continue
 
-        headline = li.find('li', class_='news_title').text.strip()
-        detail = li.find('li', class_='news_detail')
-        content = detail.get_text().strip()
+        # Extract headline from news_title li
+        headline_li = news_main.find('li', class_='news_title')
+        headline_text = headline_li.get_text(strip=True) if headline_li else None
 
-        first_a = detail.find('a', href=True)
-        url = urljoin(base_url, first_a['href']) if first_a else None
+        # Extract content from news_detail li
+        detail_li = news_main.find('li', class_='news_detail')
+        content = detail_li.get_text(strip=True) if detail_li else None
 
+        # Find all images in the detail section
         images = []
-        for img in detail.find_all('img'):
-            img_url = img.get('src')
-            link = None
-            if img.parent.name == 'a' and img.parent.has_attr('href'):
-                link = img.parent['href']
-            images.append({'image': img_url, 'link': link})
+        if detail_li:
+            for img in detail_li.find_all('img'):
+                # Check both src and data-src (for lazy loaded images)
+                img_url = img.get('src') or img.get('data-src')
+                if img_url and not img_url.startswith('http'):
+                    img_url = urljoin(base_url, img_url)
+                if img_url:
+                    images.append({'image': img_url, 'link': None})
 
         entry = {
             'date': date_str,
             'identifier': "POLARIS_CHORD",
             'type': post_type,
             'timestamp': timestamp,
-            'headline': headline,
+            'headline': headline_text,
             'content': content,
-            'url': url,
+            'url': None,
             'images': images,
             'is_ai_summary': False,
         }
         news_list.append(entry)
+
     return news_list
author	Pinapelz <yukais@pinapelz.com>	2025-10-16 11:26:27 -0700
committer	Pinapelz <yukais@pinapelz.com>	2025-10-16 11:26:27 -0700
commit	bcdec514d4823031b39d5c726b371a5bfbefd240 (patch)
tree	40e1056bcb1d4572ee3caedb67db6080a0af4cb6 /bemani
parent	9aeca20449d6b4083d53b510ef8ab819bde43846 (diff)