diff options
Diffstat (limited to 'bemani')
| -rw-r--r-- | bemani/ddr.py | 4 | ||||
| -rw-r--r-- | bemani/polaris_chord.py | 62 |
2 files changed, 65 insertions, 1 deletions
# NOTE: This text was recovered from a rendered commit diff whose line breaks
# had been collapsed. The hunk for bemani/ddr.py is kept below as a record
# (one diff line per comment line, line breaks restored); the new file
# bemani/polaris_chord.py follows as runnable code.
#
# diff --git a/bemani/ddr.py b/bemani/ddr.py
# index e9d4584..9651c48 100644
# --- a/bemani/ddr.py
# +++ b/bemani/ddr.py
# @@ -1,8 +1,10 @@
# +"""
# +Currently unused as e-eamusement app feed is favored. Here for archival purposes
# +"""
#  from bs4 import BeautifulSoup
#  from datetime import datetime
#  from urllib.parse import urljoin
#  import time
# -import re
#
#  def parse_ddr_world_news_site(html: str):
#      base_url = "https://p.eagate.573.jp"
#
# diff --git a/bemani/polaris_chord.py b/bemani/polaris_chord.py
# new file mode 100644
# index 0000000..2880dae
# --- /dev/null
# +++ b/bemani/polaris_chord.py
# @@ -0,0 +1,62 @@
"""Parse the Polaris Chord news page HTML into a list of news entries."""
from bs4 import BeautifulSoup
from datetime import datetime
import pytz
import re

# Maps the ``data-category`` attribute of a news item to its post type.
CATEGORY_MAP = {
    "i_01": "NEWS",
    "i_02": "MUSIC",
    "i_03": "EVENT",
    "i_04": "OTHER"
}

# Loop-invariant helpers, built once at import time instead of per news item.
_DATE_PATTERN = re.compile(r'(\d{4}/\d{1,2}/\d{1,2})')
_JST = pytz.timezone('Asia/Tokyo')


def _jst_midnight_timestamp(date_str: str) -> int:
    """Return the Unix timestamp of 00:00 Asia/Tokyo on ``date_str``.

    Args:
        date_str: A date in ``YYYY/MM/DD`` form.

    Raises:
        ValueError: If ``date_str`` is not a valid calendar date.
    """
    naive = datetime.strptime(date_str, '%Y/%m/%d')
    # pytz zones must be attached with localize(); passing them as tzinfo
    # directly would pick the zone's historical LMT offset.
    return int(_JST.localize(naive).timestamp())


def _collect_images(detail) -> list[dict]:
    """Return one ``{'image', 'link'}`` dict per ``<img>`` inside ``detail``.

    ``link`` is the ``href`` of the image's direct parent when that parent is
    an ``<a>`` carrying an ``href``; otherwise it is ``None``.
    """
    images = []
    for img in detail.find_all('img'):
        parent = img.parent
        is_linked = parent.name == 'a' and parent.has_attr('href')
        images.append({
            'image': img.get('src'),
            'link': parent['href'] if is_linked else None,
        })
    return images


def parse_polaris_chord_news_site(html: str) -> list[dict]:
    """Extract news entries from the Polaris Chord news page.

    Every ``li.news`` element under ``#info-news`` becomes one entry. Items
    that lack a date, title or detail node, or whose date cannot be parsed,
    are skipped rather than aborting the whole parse.

    Args:
        html: Raw HTML of the news page.

    Returns:
        A list of dicts with the keys ``date``, ``identifier``, ``type``,
        ``timestamp``, ``headline``, ``content``, ``url``, ``images`` and
        ``is_ai_summary``, in document order. ``type`` is ``None`` when the
        item's ``data-category`` is not in ``CATEGORY_MAP``; ``url`` is
        ``None`` when the detail node contains no link.
    """
    soup = BeautifulSoup(html, 'html.parser')
    news_list = []
    for li in soup.select('#info-news li.news'):
        date_node = li.find('li', class_='news_date')
        title_node = li.find('li', class_='news_title')
        detail = li.find('li', class_='news_detail')
        # find() returns None for a missing child; treat such items like
        # items with an unparsable date and skip them.
        if date_node is None or title_node is None or detail is None:
            continue

        match = _DATE_PATTERN.search(date_node.text.strip())
        if not match:
            continue
        date_str = match.group(1)

        try:
            timestamp = _jst_midnight_timestamp(date_str)
        except ValueError:
            # Matches the pattern but is not a real date (e.g. month 13).
            continue

        # The first link in the detail body is used as the entry's URL.
        first_a = detail.find('a', href=True)

        news_list.append({
            'date': date_str,
            'identifier': "POLARIS_CHORD",
            'type': CATEGORY_MAP.get(li.get('data-category')),
            'timestamp': timestamp,
            'headline': title_node.text.strip(),
            'content': detail.get_text(separator='\n').strip(),
            'url': first_a['href'] if first_a else None,
            'images': _collect_images(detail),
            'is_ai_summary': False,
        })
    return news_list
