diff options
| -rw-r--r-- | constants.py | 1 | ||||
| -rw-r--r-- | generate.py | 3 | ||||
| -rw-r--r-- | news_feed.py | 17 | ||||
| -rw-r--r-- | sega/chuni_intl.py | 81 | ||||
| -rw-r--r-- | sega/chuni_jp.py | 53 |
5 files changed, 130 insertions, 25 deletions
diff --git a/constants.py b/constants.py index 9b0d3c7..81967d5 100644 --- a/constants.py +++ b/constants.py @@ -26,6 +26,7 @@ MUSIC_DIVER_NEWS="https://mypage.musicdiver.jp/api/news?lang=en" TAIKO_BLOG_SITE="https://taiko-ch.net/blog/" ADD_EN_TRANSLATION=True # Only takes effect if an API key is provided in .env +CHUNI_RECURSIVE_IMAGE=True # Scrape the individual post pages and get all images there class CHUNITHM_VERSION(Enum): LUMINOUS_PLUS = 1 diff --git a/generate.py b/generate.py index 90d40cc..0e8070c 100644 --- a/generate.py +++ b/generate.py @@ -8,7 +8,6 @@ import news_feed as feed import constants import json import os -import argparse from datetime import datetime, timedelta @@ -102,7 +101,7 @@ def generate_maimaidx_intl_news_file(): return generate_news_file("maimaidx_intl_news", constants.MAIMAIDX_INTL_NEWS_SITE, constants.MAIMAIDX_VERSION.PRISM) def generate_chunithm_intl_news_file(): - return generate_news_file("chunithm_intl_news", constants.CHUNITHM_INTL_NEWS_SITE, constants.CHUNITHM_VERSION.LUMINOUS_PLUS) + return generate_news_file("chunithm_intl_news", constants.CHUNITHM_INTL_NEWS_SITE, constants.CHUNITHM_VERSION.VERSE) def generate_music_diver_news_file(): return generate_news_file("music_diver_news", constants.MUSIC_DIVER_NEWS) diff --git a/news_feed.py b/news_feed.py index 62af645..c9f5131 100644 --- a/news_feed.py +++ b/news_feed.py @@ -85,11 +85,28 @@ def get_news(news_url: str, version=None) -> list: if version == constants.CHUNITHM_VERSION.VERSE: news_posts = sorted(chunithm_jp.parse_chuni_jp_verse_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) news_posts = translate.add_translate_text_to_en(news_posts) + if constants.CHUNI_RECURSIVE_IMAGE: + for i in range(len(news_posts)): + if not news_posts[i]["url"]: + continue + post_site_data = download_site_as_html(news_posts[i]["url"]) + post_images = chunithm_jp.parse_chuni_jp_verse_post_images(post_site_data) + news_posts[i]["images"].extend([image for image in post_images if not any(existing_image['image'] == image['image'] for existing_image in news_posts[i]["images"])]) elif news_url == constants.CHUNITHM_INTL_NEWS_SITE: site_data = download_site_as_html(news_url) if version == constants.CHUNITHM_VERSION.LUMINOUS_PLUS: news_posts = sorted(chuni_intl.parse_chuni_intl_luminous_plus_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) + elif version == constants.CHUNITHM_VERSION.VERSE: + news_posts = sorted(chuni_intl.parse_chuni_intl_verse_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) + if constants.CHUNI_RECURSIVE_IMAGE: + for i in range(len(news_posts)): + if not news_posts[i]["url"]: + continue + post_site_data = download_site_as_html(news_posts[i]["url"]) + post_images = chuni_intl.parse_chuni_intl_verse_post_images(post_site_data) + news_posts[i]["images"].extend([image for image in post_images if not any(existing_image['image'] == image['image'] for existing_image in news_posts[i]["images"])]) + elif news_url == constants.MAIMAIDX_JP_NEWS_SITE: site_data = download_site_as_html(news_url) diff --git a/sega/chuni_intl.py b/sega/chuni_intl.py index 9b176dd..88b7695 100644 --- a/sega/chuni_intl.py +++ b/sega/chuni_intl.py @@ -1,11 +1,14 @@ -from bs4 import BeautifulSoup -from datetime import datetime, timezone, timedelta -from urllib.parse import urljoin import re +from datetime import datetime, timedelta, timezone from enum import Enum +from urllib.parse import urljoin + +from bs4 import BeautifulSoup + class ParserVersion(Enum): - ALPHA=1 + ALPHA = 1 + def make_chuni_intl_parser(identifier: str, parser: ParserVersion): def alpha_parser(html: str): @@ -37,23 +40,65 @@ def make_chuni_intl_parser(identifier: str, parser: ParserVersion): dt = datetime(year, month, day, tzinfo=jst) timestamp = int(dt.timestamp()) - results.append({ - "date": dt.strftime("%Y-%m-%d"), - "identifier": identifier, - "type": None, - "timestamp": timestamp, - "headline": None, - "content": headline, - "url": url, - "images": [{ - "image": image_url, - "link": url - }] if image_url else [] - }) + results.append( + { + "date": dt.strftime("%Y-%m-%d"), + "identifier": identifier, + "type": None, + "timestamp": timestamp, + "headline": None, + "content": headline, + "url": url, + "images": [{"image": image_url, "link": url}] if image_url else [], + } + ) return results if parser == ParserVersion.ALPHA: return alpha_parser -parse_chuni_intl_luminous_plus_news_site = make_chuni_intl_parser("CHUNITHM_INTL_LUMINOUS_PLUS", ParserVersion.ALPHA) + +def make_image_extractor(version: ParserVersion): + """ + Gets all the images from a full post page as CHUNITHM intl has more relevant images + hidden in the actual posts + """ + + def image_extractor_alpha(html: str): + base_url = "https://info-chunithm.sega.com/" + soup = BeautifulSoup(html, "html.parser") + images = [] + news_post = soup.select_one(".news--post") + if not news_post: + return images + + for img in news_post.find_all("img"): + src = img.get("src") or img.get("data-src") + if not src: + continue + + full_url = urljoin(base_url, src) + parent = img.find_parent("a") + link = parent.get("href") if parent and parent.name == "a" else None + + images.append( + {"image": full_url, "link": urljoin(base_url, link) if link else None} + ) + + return images + + if version == ParserVersion.ALPHA: + return image_extractor_alpha + else: + raise ValueError("Unknown Parser Version") + + +parse_chuni_intl_luminous_plus_news_site = make_chuni_intl_parser( + "CHUNITHM_INTL_LUMINOUS_PLUS", ParserVersion.ALPHA +) + +parse_chuni_intl_verse_news_site = make_chuni_intl_parser( + "CHUNITHM_INTL_VERSE", ParserVersion.ALPHA +) +parse_chuni_intl_verse_post_images = make_image_extractor(ParserVersion.ALPHA) diff --git a/sega/chuni_jp.py b/sega/chuni_jp.py index 981fb8f..1feafc1 100644 --- a/sega/chuni_jp.py +++ b/sega/chuni_jp.py @@ -1,11 +1,14 @@ -from bs4 import BeautifulSoup -from datetime import datetime, timezone, timedelta -from urllib.parse import urljoin import re +from datetime import datetime, timedelta, timezone from enum import Enum +from urllib.parse import urljoin + +from bs4 import BeautifulSoup + class ParserVersion(Enum): - ALPHA=1 + ALPHA = 1 + def make_chuni_jp_parser(identifier: str, parser: ParserVersion): def alpha_parser(html: str): @@ -64,7 +67,47 @@ def make_chuni_jp_parser(identifier: str, parser: ParserVersion): news_entries.append(news_dict) return news_entries + if parser == ParserVersion.ALPHA: return alpha_parser -parse_chuni_jp_verse_news_site = make_chuni_jp_parser("CHUNITHM_JP_VERSE", ParserVersion.ALPHA) + +def make_image_extractor(version: ParserVersion): + """ + Gets all the images from a full post page as CHUNITHM intl has more relevant images + hidden in the actual posts + """ + + def image_extractor_alpha(html: str): + base_url = "https://info-chunithm.sega.jp/" + soup = BeautifulSoup(html, "html.parser") + images = [] + + container = soup.select_one(".chuniCommonBox-inner-main") + if not container: + return images + for img in container.find_all("img"): + if img.find_parent("p") and "©" in img.find_parent("p").text: + continue + + src = img.get("src") or img.get("data-src") + if not src: + continue + full_url = urljoin(base_url, src) + parent = img.find_parent("a") + link = parent.get("href") if parent and parent.name == "a" else None + images.append( + {"image": full_url, "link": urljoin(base_url, link) if link else None} + ) + return images + + if version == ParserVersion.ALPHA: + return image_extractor_alpha + else: + raise ValueError("Unknown Parser Version") + + +parse_chuni_jp_verse_news_site = make_chuni_jp_parser( + "CHUNITHM_JP_VERSE", ParserVersion.ALPHA +) +parse_chuni_jp_verse_post_images = make_image_extractor(ParserVersion.ALPHA) |
