add support for scraping polaris chord

author: Pinapelz <yukais@pinapelz.com> 2025-04-28 10:23:29 -0700
committer: Pinapelz <yukais@pinapelz.com> 2025-04-28 10:23:29 -0700
commit: 638c964b7dba51b0f86c4d1f562a77e2cdb49437 (patch)
tree: 9bedc2b587439051d362089e4ec05939a0ccc32d /bemani
parent: 4852c740b0e967429f61228511af18ea25a77c12 (diff)
2 files changed, 65 insertions, 1 deletions
diff --git a/bemani/ddr.py b/bemani/ddr.py
index e9d4584..9651c48 100644
--- a/bemani/ddr.py
+++ b/bemani/ddr.py
@@ -1,8 +1,10 @@
+"""
+Currently unused because the e-amusement app feed is favored. Kept for archival purposes.
+"""
 from bs4 import BeautifulSoup
 from datetime import datetime
 from urllib.parse import urljoin
 import time
-import re
 
 def parse_ddr_world_news_site(html: str):
     base_url = "https://p.eagate.573.jp"
diff --git a/bemani/polaris_chord.py b/bemani/polaris_chord.py
new file mode 100644
index 0000000..2880dae
--- /dev/null
+++ b/bemani/polaris_chord.py
@@ -0,0 +1,62 @@
+from bs4 import BeautifulSoup
+from datetime import datetime
+import pytz
+import re
+
+# Post type label for each `data-category` code carried by a news item;
+# the parser looks codes up with .get(), so unlisted codes yield None.
+CATEGORY_MAP = {
+    "i_01": "NEWS", "i_02": "MUSIC",
+    "i_03": "EVENT", "i_04": "OTHER",
+}
+
+
+def parse_polaris_chord_news_site(html: str) -> list[dict]:
+    """Parse the Polaris Chord news page HTML into a list of news entries.
+
+    Items lacking a date, title or detail node, or whose date cannot be
+    parsed, are skipped instead of aborting the whole scrape.
+
+    Returns one dict per item with keys: date, identifier, type, timestamp,
+    headline, content, url, images, is_ai_summary.
+    """
+    soup = BeautifulSoup(html, 'html.parser')
+    jst = pytz.timezone('Asia/Tokyo')  # loop-invariant; dates are read as JST midnight
+    news_list = []
+    for li in soup.select('#info-news li.news'):
+        date_node = li.find('li', class_='news_date')
+        title_node = li.find('li', class_='news_title')
+        detail = li.find('li', class_='news_detail')
+        if date_node is None or title_node is None or detail is None:
+            continue  # find() returns None for a missing node; skip the item
+        match = re.search(r'(\d{4}/\d{1,2}/\d{1,2})', date_node.text)
+        if not match:
+            continue
+        date_str = match.group(1)
+        try:
+            dt = datetime.strptime(date_str, '%Y/%m/%d')
+        except ValueError:
+            continue
+        timestamp = int(jst.localize(dt).timestamp())
+
+        first_a = detail.find('a', href=True)
+        images = []
+        for img in detail.find_all('img'):
+            parent = img.parent
+            # An image wrapped directly in an anchor carries that anchor's link.
+            linked = parent.name == 'a' and parent.has_attr('href')
+            images.append({'image': img.get('src'),
+                           'link': parent['href'] if linked else None})
+
+        news_list.append({
+            'date': date_str,
+            'identifier': "POLARIS_CHORD",
+            'type': CATEGORY_MAP.get(li.get('data-category')),
+            'timestamp': timestamp,
+            'headline': title_node.text.strip(),
+            'content': detail.get_text(separator='\n').strip(),
+            'url': first_a['href'] if first_a else None,
+            'images': images,
+            'is_ai_summary': False,
+        })
+    return news_list
author	Pinapelz <yukais@pinapelz.com>	2025-04-28 10:23:29 -0700
committer	Pinapelz <yukais@pinapelz.com>	2025-04-28 10:23:29 -0700
commit	638c964b7dba51b0f86c4d1f562a77e2cdb49437 (patch)
tree	9bedc2b587439051d362089e4ec05939a0ccc32d /bemani
parent	4852c740b0e967429f61228511af18ea25a77c12 (diff)