From fed975d2b2f8c1763f268c7e668c1d4b0cfabd92 Mon Sep 17 00:00:00 2001 From: Pinapelz Date: Sun, 13 Apr 2025 18:36:11 -0700 Subject: feat: merged news feed --- bemani/iidx.py | 1 + bemani/sdvx.py | 1 + generate.py | 47 +++++++++++++++++++++++++++++++++++++++++++++++ news_feed.py | 12 +++--------- scrape.py | 26 -------------------------- sega/chuni_jp.py | 1 + 6 files changed, 53 insertions(+), 35 deletions(-) create mode 100644 generate.py delete mode 100644 scrape.py diff --git a/bemani/iidx.py b/bemani/iidx.py index 978ecbd..0d97e91 100644 --- a/bemani/iidx.py +++ b/bemani/iidx.py @@ -50,6 +50,7 @@ def parse_pinky_crush_news_site(html: str, base_url): content = re.sub(r'\s*/\s*', '/', content) news_items.append({ "date": date_str, + "identifier": "IIDX_PINKY_CRUSH", "type": type_map[type_class], "timestamp": timestamp, "headline": headline, diff --git a/bemani/sdvx.py b/bemani/sdvx.py index 50772e8..83d0d7c 100644 --- a/bemani/sdvx.py +++ b/bemani/sdvx.py @@ -36,6 +36,7 @@ def parse_exceed_gear_news_site(html: str, base_url: str): entries.append({ 'date': date_str, + 'identifier': 'SOUND_VOLTEX_EXCEED_GEAR', 'type': None, 'timestamp': timestamp, 'headline': headline_text, diff --git a/generate.py b/generate.py new file mode 100644 index 0000000..209e924 --- /dev/null +++ b/generate.py @@ -0,0 +1,47 @@ +""" +Generates news JSON files +""" +import news_feed as feed +import constants +import json +import os + +from datetime import datetime, timedelta + + +OUTPUT_DIR = "news" + +def create_merged_feed(*news_lists): + merged_feed = [] + for news_list in news_lists: + merged_feed.extend(news_list) + cutoff_date = datetime.now() - timedelta(days=constants.DAYS_LIMIT) + filtered_feed = [news for news in merged_feed if datetime.fromtimestamp(news['timestamp']) >= cutoff_date] + sorted_feed = sorted(filtered_feed, key=lambda x: x['timestamp'], reverse=True) + return sorted_feed + +def attach_news_meta_data(news_data: list): + return { + "fetch_time": int(datetime.now().timestamp()), + "news_posts": news_data + } + +if __name__ == "__main__": + if not os.path.exists(OUTPUT_DIR): + os.makedirs(OUTPUT_DIR) + + iidx_news_data = feed.get_news(constants.IIDX_PINKY_CRUSH_NEWS_SITE) + with open(OUTPUT_DIR+'/iidx_news.json', 'w') as json_file: + json.dump(attach_news_meta_data(iidx_news_data), json_file) + + sdvx_news_data = feed.get_news(constants.SOUND_VOLTEX_EXCEED_GEAR_NEWS_SITE) + with open(OUTPUT_DIR+'/sdvx_news.json', 'w') as json_file: + json.dump(attach_news_meta_data(sdvx_news_data), json_file) + + chunithm_jp_news_data = feed.get_news(constants.CHUNITHM_NEWS_SITE, constants.CHUNITHM_VERSION.VERSE) + with open(OUTPUT_DIR+'/chunithm_jp_news.json', 'w') as json_file: + json.dump(attach_news_meta_data(chunithm_jp_news_data), json_file) + + news = create_merged_feed(iidx_news_data, sdvx_news_data, chunithm_jp_news_data) + with open(OUTPUT_DIR+'/news.json', 'w') as json_file: + json.dump(attach_news_meta_data(news), json_file) diff --git a/news_feed.py b/news_feed.py index 5737cea..1a04e6c 100644 --- a/news_feed.py +++ b/news_feed.py @@ -1,7 +1,8 @@ """ Generic format for a news entry. All keys are considered to be nullable { - 'date': JST date of news post + 'date': JST date of news post, + 'identifier': unique identifier for the game (usually some deriv. of the title), 'type': Type of post if available, otherwise if not provided it will be None (aka Generic news) 'timestamp': Unixtime of date above, 'headline': Headline, @@ -16,7 +17,6 @@ Generic format for a news entry. All keys are considered to be nullable """ from email.utils import parsedate_to_datetime -from datetime import datetime from site_scraper import SiteScraper import bemani.sdvx as sound_voltex import bemani.iidx as iidx @@ -25,7 +25,6 @@ import constants def get_news(news_url: str, version=None) -> list: scraper = SiteScraper(headless=True) - news_json = {} site_data = scraper.get_page_source(news_url) if news_url == constants.SOUND_VOLTEX_EXCEED_GEAR_NEWS_SITE: news_posts = sorted(sound_voltex.parse_exceed_gear_news_site(site_data, constants.EAMUSEMENT_BASE_URL), key=lambda x: x['timestamp'], reverse=True) @@ -37,9 +36,4 @@ def get_news(news_url: str, version=None) -> list: else: news_posts = [] scraper.close() - news_json = { - "fetch_date": int(datetime.now().timestamp()), - "posts": news_posts - - } - return news_json + return news_posts diff --git a/scrape.py b/scrape.py deleted file mode 100644 index 8d1f467..0000000 --- a/scrape.py +++ /dev/null @@ -1,26 +0,0 @@ -""" -Generates news JSON files -""" -import news_feed as feed -import constants -import json -import os - - -OUTPUT_DIR = "news" - -if __name__ == "__main__": - if not os.path.exists(OUTPUT_DIR): - os.makedirs(OUTPUT_DIR) - - iidx_news_data = feed.get_news(constants.IIDX_PINKY_CRUSH_NEWS_SITE) - with open(OUTPUT_DIR+'/iidx_news.json', 'w') as json_file: - json.dump(iidx_news_data, json_file) - - sdvx_news_data = feed.get_news(constants.SOUND_VOLTEX_EXCEED_GEAR_NEWS_SITE) - with open(OUTPUT_DIR+'/sdvx_news.json', 'w') as json_file: - json.dump(sdvx_news_data, json_file) - - chunithm_jp_news_data = feed.get_news(constants.CHUNITHM_NEWS_SITE, constants.CHUNITHM_VERSION.VERSE) - with open(OUTPUT_DIR+'/chunithm_jp_news.json', 'w') as json_file: - json.dump(chunithm_jp_news_data, json_file) diff --git a/sega/chuni_jp.py b/sega/chuni_jp.py index df727d4..a45872d 100644 --- a/sega/chuni_jp.py +++ b/sega/chuni_jp.py @@ -54,6 +54,7 @@ def parse_chuni_jp_verse_news_site(html: str): images["image"] = img_tag.get("src") images["link"] = news_url news_dict["images"] = images + news_dict["identifier"] = "CHUNITHM_JP_VERSE" news_entries.append(news_dict) -- cgit v1.2.3