diff options
| author | Pinapelz <yukais@pinapelz.com> | 2025-04-28 10:23:29 -0700 |
|---|---|---|
| committer | Pinapelz <yukais@pinapelz.com> | 2025-04-28 10:23:29 -0700 |
| commit | 638c964b7dba51b0f86c4d1f562a77e2cdb49437 (patch) | |
| tree | 9bedc2b587439051d362089e4ec05939a0ccc32d | |
| parent | 4852c740b0e967429f61228511af18ea25a77c12 (diff) | |
add support for scraping polaris chord
| -rw-r--r-- | bemani/ddr.py | 4 | ||||
| -rw-r--r-- | bemani/polaris_chord.py | 62 | ||||
| -rw-r--r-- | constants.py | 1 | ||||
| -rw-r--r-- | generate.py | 4 | ||||
| -rw-r--r-- | news_feed.py | 7 | ||||
| -rw-r--r-- | requirements.txt | bin | 1384 -> 730 bytes |
6 files changed, 77 insertions, 1 deletions
diff --git a/bemani/ddr.py b/bemani/ddr.py index e9d4584..9651c48 100644 --- a/bemani/ddr.py +++ b/bemani/ddr.py @@ -1,8 +1,10 @@ +""" +Currently unused as e-eamusement app feed is favored. Here for archival purposes +""" from bs4 import BeautifulSoup from datetime import datetime from urllib.parse import urljoin import time -import re def parse_ddr_world_news_site(html: str): base_url = "https://p.eagate.573.jp" diff --git a/bemani/polaris_chord.py b/bemani/polaris_chord.py new file mode 100644 index 0000000..2880dae --- /dev/null +++ b/bemani/polaris_chord.py @@ -0,0 +1,62 @@ +from bs4 import BeautifulSoup +from datetime import datetime +import pytz +import re + +CATEGORY_MAP = { + "i_01": "NEWS", + "i_02": "MUSIC", + "i_03": "EVENT", + "i_04": "OTHER" +} + + +def parse_polaris_chord_news_site(html: str) -> list[dict]: + soup = BeautifulSoup(html, 'html.parser') + news_list = [] + for li in soup.select('#info-news li.news'): + raw_date = li.find('li', class_='news_date').text.strip() + match = re.search(r'(\d{4}/\d{1,2}/\d{1,2})', raw_date) + if not match: + continue + date_str = match.group(1) + + try: + dt = datetime.strptime(date_str, '%Y/%m/%d') + except ValueError: + continue + jst = pytz.timezone('Asia/Tokyo') + dt_jst = jst.localize(dt) + timestamp = int(dt_jst.timestamp()) + + raw_type = li.get('data-category') + post_type = CATEGORY_MAP.get(raw_type) + + headline = li.find('li', class_='news_title').text.strip() + detail = li.find('li', class_='news_detail') + content = detail.get_text(separator='\n').strip() + + first_a = detail.find('a', href=True) + url = first_a['href'] if first_a else None + + images = [] + for img in detail.find_all('img'): + img_url = img.get('src') + link = None + if img.parent.name == 'a' and img.parent.has_attr('href'): + link = img.parent['href'] + images.append({'image': img_url, 'link': link}) + + entry = { + 'date': date_str, + 'identifier': "POLARIS_CHORD", + 'type': post_type, + 'timestamp': timestamp, + 'headline': 
headline, + 'content': content, + 'url': url, + 'images': images, + 'is_ai_summary': False, + } + news_list.append(entry) + return news_list diff --git a/constants.py b/constants.py index ba725f0..f69a610 100644 --- a/constants.py +++ b/constants.py @@ -5,6 +5,7 @@ DAYS_LIMIT=14 SOUND_VOLTEX_EXCEED_GEAR_NEWS_SITE ="https://p.eagate.573.jp/game/sdvx/vi/news/index.html" IIDX_PINKY_CRUSH_NEWS_SITE="https://p.eagate.573.jp/game/2dx/32/info/index.html" DDR_WORLD_NEWS_SITE="https://p.eagate.573.jp/game/ddr/ddrworld/info/index.html" +POLARIS_CHORD_NEWS_SITE="https://p.eagate.573.jp/game/polarischord/pc/news/index.html" EAMUSE_APP_FEED="https://eam.573.jp/app/web/post/official" IIDX_EAMUSE_APP_ID="s8svjrq62x592gvb" diff --git a/generate.py b/generate.py index a3edbd3..84033b7 100644 --- a/generate.py +++ b/generate.py @@ -86,6 +86,9 @@ def generate_ddr_news_file(eamuse_feed: bool=False): else: return generate_news_file("ddr_news", constants.DDR_WORLD_NEWS_SITE) +def generate_polaris_chord_news_file(): + return generate_news_file("polaris_chord_news", constants.POLARIS_CHORD_NEWS_SITE) + def generate_popn_music_news_file(): return generate_news_file("popn_music_news", constants.EAMUSE_APP_FEED, constants.POPN_MUSIC_EAMUSE_APP_ID) @@ -136,6 +139,7 @@ if __name__ == "__main__": iidx_news_data = generate_iidx_news_file(eamuse_feed=True) sdvx_news_data = generate_sdvx_news_file() ddr_news_data = generate_ddr_news_file(eamuse_feed=True) + polaris_news_data = generate_polaris_chord_news_file() gitadora_news_data = generate_gitadora_news_file() popn_music_news_data = generate_popn_music_news_file() jubeat_news_data = generate_jubeat_news_file() diff --git a/news_feed.py b/news_feed.py index d78c78c..4896322 100644 --- a/news_feed.py +++ b/news_feed.py @@ -25,6 +25,7 @@ import bemani.sdvx as sound_voltex import bemani.iidx as iidx import bemani.ddr as ddr import sega.chuni_jp as chunithm_jp +import bemani.polaris_chord as polaris_chord import sega.chuni_intl as chuni_intl import 
sega.maimaidx_jp as maimaidx_jp import sega.maimaidx_intl as maimaidx_intl @@ -60,6 +61,12 @@ def get_news(news_url: str, version=None) -> list: news_posts = sorted(iidx.parse_pinky_crush_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) news_posts = translate.add_translate_text_to_en(news_posts, iidx.KEY_TERMS_TL) + elif news_url == constants.POLARIS_CHORD_NEWS_SITE: + scraper = SiteScraper(headless=True) + site_data = scraper.get_page_source(news_url) + news_posts = sorted(polaris_chord.parse_polaris_chord_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) + news_posts = translate.add_translate_text_to_en(news_posts, iidx.KEY_TERMS_TL) + elif news_url == constants.EAMUSE_APP_FEED: scraper = SiteScraper(headless=True) site_data = scraper.get_page_source(news_url+"/?uuid_to="+version) diff --git a/requirements.txt b/requirements.txt Binary files differ index 7cbd3b8..c310de5 100644 --- a/requirements.txt +++ b/requirements.txt |
