From 03bc3271e0719e33c9517180bb2d39b0d73b7b90 Mon Sep 17 00:00:00 2001 From: Pinapelz Date: Sun, 13 Apr 2025 18:00:11 -0700 Subject: move to generic news_feed module --- .gitignore | 1 + bemani/iidx.py | 1 + bemani/sdvx.py | 1 + constants.py | 9 +++++++++ konami.py | 33 --------------------------------- news_feed.py | 45 +++++++++++++++++++++++++++++++++++++++++++++ scrape.py | 26 ++++++++++++++++++++++++++ 7 files changed, 83 insertions(+), 33 deletions(-) delete mode 100644 konami.py create mode 100644 news_feed.py create mode 100644 scrape.py diff --git a/.gitignore b/.gitignore index 0a19790..828761b 100644 --- a/.gitignore +++ b/.gitignore @@ -172,3 +172,4 @@ cython_debug/ # PyPI configuration file .pypirc +news diff --git a/bemani/iidx.py b/bemani/iidx.py index e20dd7d..978ecbd 100644 --- a/bemani/iidx.py +++ b/bemani/iidx.py @@ -54,6 +54,7 @@ def parse_pinky_crush_news_site(html: str, base_url): "timestamp": timestamp, "headline": headline, "content": content, + "url": None, "images": [], }) diff --git a/bemani/sdvx.py b/bemani/sdvx.py index 55d97ef..50772e8 100644 --- a/bemani/sdvx.py +++ b/bemani/sdvx.py @@ -40,6 +40,7 @@ def parse_exceed_gear_news_site(html: str, base_url: str): 'timestamp': timestamp, 'headline': headline_text, 'content': content, + "url": None, 'images': images }) diff --git a/constants.py b/constants.py index f131a63..5ca4d1e 100644 --- a/constants.py +++ b/constants.py @@ -1,3 +1,12 @@ +from enum import Enum + +DAYS_LIMIT=7 + EAMUSEMENT_BASE_URL = "https://p.eagate.573.jp" SOUND_VOLTEX_EXCEED_GEAR_NEWS_SITE ="https://p.eagate.573.jp/game/sdvx/vi/news/index.html" IIDX_PINKY_CRUSH_NEWS_SITE="https://p.eagate.573.jp/game/2dx/32/info/index.html" + +CHUNITHM_NEWS_SITE="https://info-chunithm.sega.jp/" + +class CHUNITHM_VERSION(Enum): + VERSE = 1 diff --git a/konami.py b/konami.py deleted file mode 100644 index 438b1ed..0000000 --- a/konami.py +++ /dev/null @@ -1,33 +0,0 @@ -""" -Fetching data for Konami/Bemani games -{ - 'date': JST date of news post - 'type': Type of post if available, otherwise if not provided it will be None (aka Generic news) - 'timestamp': Unixtime of date above, - 'headline': Headline, - 'content': All text content of news, - 'images': { - 'image': URL to image, - 'link': If there's an associated href. Else None - - } -} -""" - -from email.utils import parsedate_to_datetime -from site_scraper import SiteScraper -import bemani.sdvx as sound_voltex -import bemani.iidx as iidx -import constants - -def get_news(news_url: str) -> list: - scraper = SiteScraper(headless=True) - site_data = scraper.get_page_source(news_url) - if news_url == constants.SOUND_VOLTEX_EXCEED_GEAR_NEWS_SITE: - news_posts = sorted(sound_voltex.parse_exceed_gear_news_site(site_data, constants.EAMUSEMENT_BASE_URL), key=lambda x: x['timestamp'], reverse=True) - elif news_url == constants.IIDX_PINKY_CRUSH_NEWS_SITE: - news_posts = sorted(iidx.parse_pinky_crush_news_site(site_data, constants.EAMUSEMENT_BASE_URL), key=lambda x: x['timestamp'], reverse=True) - else: - news_posts = [] - scraper.close() - return news_posts diff --git a/news_feed.py b/news_feed.py new file mode 100644 index 0000000..5737cea --- /dev/null +++ b/news_feed.py @@ -0,0 +1,45 @@ +""" +Generic format for a news entry. All keys are considered to be nullable +{ + 'date': JST date of news post + 'type': Type of post if available, otherwise if not provided it will be None (aka Generic news) + 'timestamp': Unixtime of date above, + 'headline': Headline, + 'content': All text content of news, + 'url': URL to full post if available, + 'images': { + 'image': URL to image, + 'link': If there's an associated href. Else None + + } +} +""" + +from email.utils import parsedate_to_datetime +from datetime import datetime +from site_scraper import SiteScraper +import bemani.sdvx as sound_voltex +import bemani.iidx as iidx +import sega.chuni_jp as chunithm_jp +import constants + +def get_news(news_url: str, version=None) -> list: + scraper = SiteScraper(headless=True) + news_json = {} + site_data = scraper.get_page_source(news_url) + if news_url == constants.SOUND_VOLTEX_EXCEED_GEAR_NEWS_SITE: + news_posts = sorted(sound_voltex.parse_exceed_gear_news_site(site_data, constants.EAMUSEMENT_BASE_URL), key=lambda x: x['timestamp'], reverse=True) + elif news_url == constants.IIDX_PINKY_CRUSH_NEWS_SITE: + news_posts = sorted(iidx.parse_pinky_crush_news_site(site_data, constants.EAMUSEMENT_BASE_URL), key=lambda x: x['timestamp'], reverse=True) + elif news_url == constants.CHUNITHM_NEWS_SITE: + if version == constants.CHUNITHM_VERSION.VERSE: + news_posts = sorted(chunithm_jp.parse_chuni_jp_verse_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) + else: + news_posts = [] + scraper.close() + news_json = { + "fetch_date": int(datetime.now().timestamp()), + "posts": news_posts + + } + return news_json diff --git a/scrape.py b/scrape.py new file mode 100644 index 0000000..8d1f467 --- /dev/null +++ b/scrape.py @@ -0,0 +1,26 @@ +""" +Generates news JSON files +""" +import news_feed as feed +import constants +import json +import os + + +OUTPUT_DIR = "news" + +if __name__ == "__main__": + if not os.path.exists(OUTPUT_DIR): + os.makedirs(OUTPUT_DIR) + + iidx_news_data = feed.get_news(constants.IIDX_PINKY_CRUSH_NEWS_SITE) + with open(OUTPUT_DIR+'/iidx_news.json', 'w') as json_file: + json.dump(iidx_news_data, json_file) + + sdvx_news_data = feed.get_news(constants.SOUND_VOLTEX_EXCEED_GEAR_NEWS_SITE) + with open(OUTPUT_DIR+'/sdvx_news.json', 'w') as json_file: + json.dump(sdvx_news_data, json_file) + + chunithm_jp_news_data = feed.get_news(constants.CHUNITHM_NEWS_SITE, constants.CHUNITHM_VERSION.VERSE) + with open(OUTPUT_DIR+'/chunithm_jp_news.json', 'w') as json_file: + json.dump(chunithm_jp_news_data, json_file) -- cgit v1.2.3