diff options
| author | Pinapelz <yukais@pinapelz.com> | 2025-04-28 10:23:29 -0700 |
|---|---|---|
| committer | Pinapelz <yukais@pinapelz.com> | 2025-04-28 10:23:29 -0700 |
| commit | 638c964b7dba51b0f86c4d1f562a77e2cdb49437 (patch) | |
| tree | 9bedc2b587439051d362089e4ec05939a0ccc32d /bemani/polaris_chord.py | |
| parent | 4852c740b0e967429f61228511af18ea25a77c12 (diff) | |
add support for scraping polaris chord
Diffstat (limited to 'bemani/polaris_chord.py')
| -rw-r--r-- | bemani/polaris_chord.py | 62 |
1 files changed, 62 insertions, 0 deletions
"""Parser for the Polaris Chord news page (bemani/polaris_chord.py)."""
from datetime import datetime
import re

from bs4 import BeautifulSoup
import pytz

# Maps the ``data-category`` attribute of a news item to the post type
# stored in the parsed entry.
CATEGORY_MAP = {
    "i_01": "NEWS",
    "i_02": "MUSIC",
    "i_03": "EVENT",
    "i_04": "OTHER",
}

# Loop invariants hoisted out of the parser: the date pattern and the
# timezone used to anchor each (time-less) news date.
_DATE_PATTERN = re.compile(r'(\d{4}/\d{1,2}/\d{1,2})')
_JST = pytz.timezone('Asia/Tokyo')


def _collect_images(detail) -> list[dict]:
    """Return one ``{'image': src, 'link': href-or-None}`` dict per ``<img>`` in *detail*."""
    images = []
    for img in detail.find_all('img'):
        parent = img.parent
        # Only an <a href=...> that directly wraps the image counts as its link.
        is_linked = (
            parent is not None
            and parent.name == 'a'
            and parent.has_attr('href')
        )
        images.append({
            'image': img.get('src'),
            'link': parent['href'] if is_linked else None,
        })
    return images


def parse_polaris_chord_news_site(html: str) -> list[dict]:
    """Parse the Polaris Chord news page HTML into a list of news entries.

    Every ``#info-news li.news`` element is turned into a dict with the keys
    ``date``, ``identifier``, ``type``, ``timestamp``, ``headline``,
    ``content``, ``url``, ``images`` and ``is_ai_summary``.

    Items are skipped (never raised on) when they have no ``li.news_date``
    child, when that child contains no ``YYYY/M/D`` date, or when the matched
    text is not a valid calendar date. A missing ``li.news_title`` yields an
    empty headline; a missing ``li.news_detail`` yields empty content, no URL
    and no images.

    Args:
        html: Raw HTML of the news page.

    Returns:
        The parsed entries, in document order. ``timestamp`` is the Unix time
        of midnight Asia/Tokyo on the entry's date; ``type`` is ``None`` when
        the item's ``data-category`` is not in ``CATEGORY_MAP``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    news_list = []
    for li in soup.select('#info-news li.news'):
        date_node = li.find('li', class_='news_date')
        if date_node is None:
            # Without a date there is nothing to timestamp; treat it like any
            # other undatable item instead of crashing the whole scrape.
            continue
        match = _DATE_PATTERN.search(date_node.text.strip())
        if not match:
            continue
        date_str = match.group(1)

        try:
            dt = datetime.strptime(date_str, '%Y/%m/%d')
        except ValueError:
            # Looked like a date but is not one (e.g. month 13).
            continue
        # pytz zones must be attached with localize(), not tzinfo=.
        timestamp = int(_JST.localize(dt).timestamp())

        title_node = li.find('li', class_='news_title')
        headline = title_node.text.strip() if title_node is not None else ''

        detail = li.find('li', class_='news_detail')
        if detail is None:
            content, url, images = '', None, []
        else:
            content = detail.get_text(separator='\n').strip()
            # The first link in the detail body is used as the entry URL.
            first_a = detail.find('a', href=True)
            url = first_a['href'] if first_a else None
            images = _collect_images(detail)

        news_list.append({
            'date': date_str,
            'identifier': "POLARIS_CHORD",
            'type': CATEGORY_MAP.get(li.get('data-category')),
            'timestamp': timestamp,
            'headline': headline,
            'content': content,
            'url': url,
            'images': images,
            'is_ai_summary': False,
        })
    return news_list
