aboutsummaryrefslogtreecommitdiffstats
path: root/bemani
diff options
context:
space:
mode:
authorPinapelz <yukais@pinapelz.com>2025-04-28 10:23:29 -0700
committerPinapelz <yukais@pinapelz.com>2025-04-28 10:23:29 -0700
commit638c964b7dba51b0f86c4d1f562a77e2cdb49437 (patch)
tree9bedc2b587439051d362089e4ec05939a0ccc32d /bemani
parent4852c740b0e967429f61228511af18ea25a77c12 (diff)
add support for scraping polaris chord
Diffstat (limited to 'bemani')
-rw-r--r--bemani/ddr.py4
-rw-r--r--bemani/polaris_chord.py62
2 files changed, 65 insertions, 1 deletions
diff --git a/bemani/ddr.py b/bemani/ddr.py
index e9d4584..9651c48 100644
--- a/bemani/ddr.py
+++ b/bemani/ddr.py
@@ -1,8 +1,10 @@
+"""
+Currently unused because the e-amusement app feed is preferred. Kept here for archival purposes.
+"""
from bs4 import BeautifulSoup
from datetime import datetime
from urllib.parse import urljoin
import time
-import re
def parse_ddr_world_news_site(html: str):
base_url = "https://p.eagate.573.jp"
diff --git a/bemani/polaris_chord.py b/bemani/polaris_chord.py
new file mode 100644
index 0000000..2880dae
--- /dev/null
+++ b/bemani/polaris_chord.py
@@ -0,0 +1,62 @@
+from bs4 import BeautifulSoup
+from datetime import datetime
+import pytz
+import re
+
# Translates the `data-category` attribute found on each news item into the
# post-type label stored in the parsed entry.
CATEGORY_MAP: dict[str, str] = {
    "i_01": "NEWS",
    "i_02": "MUSIC",
    "i_03": "EVENT",
    "i_04": "OTHER",
}
+
+
def parse_polaris_chord_news_site(html: str) -> list[dict]:
    """Parse the Polaris Chord news page HTML into a list of news entries.

    Every ``li.news`` element under ``#info-news`` is turned into one entry.
    Parsing is best-effort: an item is skipped (never raised on) when its
    date, title or detail element is missing, or when no valid
    ``YYYY/MM/DD`` date can be read from the date element.

    Args:
        html: Raw HTML of the news page.

    Returns:
        One dict per successfully parsed item, in the order the items are
        encountered, with the keys:

        * ``date``: the matched ``YYYY/M/D`` text.
        * ``identifier``: always ``"POLARIS_CHORD"``.
        * ``type``: label from ``CATEGORY_MAP`` for the item's
          ``data-category`` attribute, or ``None`` if it is not mapped.
        * ``timestamp``: Unix timestamp (seconds) of midnight on ``date``
          in the Asia/Tokyo timezone.
        * ``headline``: stripped text of the title element.
        * ``content``: text of the detail element, fragments joined by
          newlines.
        * ``url``: ``href`` of the first link in the detail element, or
          ``None``.
        * ``images``: list of ``{'image': src, 'link': href-or-None}``
          dicts, one per ``<img>`` in the detail element.
        * ``is_ai_summary``: always ``False``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Loop-invariant: build the timezone once instead of once per item.
    jst = pytz.timezone('Asia/Tokyo')
    news_list = []

    for li in soup.select('#info-news li.news'):
        date_node = li.find('li', class_='news_date')
        title_node = li.find('li', class_='news_title')
        detail = li.find('li', class_='news_detail')
        # find() returns None when the element is absent; skip the malformed
        # item rather than letting one bad entry abort the whole scrape.
        if date_node is None or title_node is None or detail is None:
            continue

        match = re.search(r'(\d{4}/\d{1,2}/\d{1,2})', date_node.text.strip())
        if not match:
            continue
        date_str = match.group(1)

        try:
            dt = datetime.strptime(date_str, '%Y/%m/%d')
        except ValueError:
            # Matched the pattern but is not a real calendar date.
            continue
        timestamp = int(jst.localize(dt).timestamp())

        first_a = detail.find('a', href=True)

        images = []
        for img in detail.find_all('img'):
            parent = img.parent
            # Only an <a href=...> that directly wraps the image counts as
            # the image's link.
            linked = parent.name == 'a' and parent.has_attr('href')
            images.append({
                'image': img.get('src'),
                'link': parent['href'] if linked else None,
            })

        news_list.append({
            'date': date_str,
            'identifier': "POLARIS_CHORD",
            'type': CATEGORY_MAP.get(li.get('data-category')),
            'timestamp': timestamp,
            'headline': title_node.text.strip(),
            'content': detail.get_text(separator='\n').strip(),
            'url': first_a['href'] if first_a else None,
            'images': images,
            'is_ai_summary': False,
        })
    return news_list
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage