aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPinapelz <yukais@pinapelz.com>2025-04-21 14:04:07 -0700
committerPinapelz <yukais@pinapelz.com>2025-04-21 14:04:07 -0700
commitd88ea267b780aebc077b43dcf56a0141c3abe6d4 (patch)
treee65f915711e95a9224c3cc095c95d22ebe108951
parent0e0dc8c6615c648f55727ce3b2c96560864b5b0f (diff)
chuni: append images from detailed view of posts for more images
-rw-r--r--constants.py1
-rw-r--r--generate.py3
-rw-r--r--news_feed.py17
-rw-r--r--sega/chuni_intl.py81
-rw-r--r--sega/chuni_jp.py53
5 files changed, 130 insertions, 25 deletions
diff --git a/constants.py b/constants.py
index 9b0d3c7..81967d5 100644
--- a/constants.py
+++ b/constants.py
@@ -26,6 +26,7 @@ MUSIC_DIVER_NEWS="https://mypage.musicdiver.jp/api/news?lang=en"
TAIKO_BLOG_SITE="https://taiko-ch.net/blog/"
ADD_EN_TRANSLATION=True # Only takes effect if an API key is provided in .env
+CHUNI_RECURSIVE_IMAGE=True # Scrape the individual post pages and get all images there
class CHUNITHM_VERSION(Enum):
LUMINOUS_PLUS = 1
diff --git a/generate.py b/generate.py
index 90d40cc..0e8070c 100644
--- a/generate.py
+++ b/generate.py
@@ -8,7 +8,6 @@ import news_feed as feed
import constants
import json
import os
-import argparse
from datetime import datetime, timedelta
@@ -102,7 +101,7 @@ def generate_maimaidx_intl_news_file():
return generate_news_file("maimaidx_intl_news", constants.MAIMAIDX_INTL_NEWS_SITE, constants.MAIMAIDX_VERSION.PRISM)
def generate_chunithm_intl_news_file():
- return generate_news_file("chunithm_intl_news", constants.CHUNITHM_INTL_NEWS_SITE, constants.CHUNITHM_VERSION.LUMINOUS_PLUS)
+ return generate_news_file("chunithm_intl_news", constants.CHUNITHM_INTL_NEWS_SITE, constants.CHUNITHM_VERSION.VERSE)
def generate_music_diver_news_file():
return generate_news_file("music_diver_news", constants.MUSIC_DIVER_NEWS)
diff --git a/news_feed.py b/news_feed.py
index 62af645..c9f5131 100644
--- a/news_feed.py
+++ b/news_feed.py
@@ -85,11 +85,28 @@ def get_news(news_url: str, version=None) -> list:
if version == constants.CHUNITHM_VERSION.VERSE:
news_posts = sorted(chunithm_jp.parse_chuni_jp_verse_news_site(site_data), key=lambda x: x['timestamp'], reverse=True)
news_posts = translate.add_translate_text_to_en(news_posts)
+ if constants.CHUNI_RECURSIVE_IMAGE:
+ for i in range(len(news_posts)):
+ if not news_posts[i]["url"]:
+ continue
+ post_site_data = download_site_as_html(news_posts[i]["url"])
+ post_images = chunithm_jp.parse_chuni_jp_verse_post_images(post_site_data)
+ news_posts[i]["images"].extend([image for image in post_images if not any(existing_image['image'] == image['image'] for existing_image in news_posts[i]["images"])])
elif news_url == constants.CHUNITHM_INTL_NEWS_SITE:
site_data = download_site_as_html(news_url)
if version == constants.CHUNITHM_VERSION.LUMINOUS_PLUS:
news_posts = sorted(chuni_intl.parse_chuni_intl_luminous_plus_news_site(site_data), key=lambda x: x['timestamp'], reverse=True)
+ elif version == constants.CHUNITHM_VERSION.VERSE:
+ news_posts = sorted(chuni_intl.parse_chuni_intl_verse_news_site(site_data), key=lambda x: x['timestamp'], reverse=True)
+ if constants.CHUNI_RECURSIVE_IMAGE:
+ for i in range(len(news_posts)):
+ if not news_posts[i]["url"]:
+ continue
+ post_site_data = download_site_as_html(news_posts[i]["url"])
+ post_images = chuni_intl.parse_chuni_intl_verse_post_images(post_site_data)
+ news_posts[i]["images"].extend([image for image in post_images if not any(existing_image['image'] == image['image'] for existing_image in news_posts[i]["images"])])
+
elif news_url == constants.MAIMAIDX_JP_NEWS_SITE:
site_data = download_site_as_html(news_url)
diff --git a/sega/chuni_intl.py b/sega/chuni_intl.py
index 9b176dd..88b7695 100644
--- a/sega/chuni_intl.py
+++ b/sega/chuni_intl.py
@@ -1,11 +1,14 @@
-from bs4 import BeautifulSoup
-from datetime import datetime, timezone, timedelta
-from urllib.parse import urljoin
import re
+from datetime import datetime, timedelta, timezone
from enum import Enum
+from urllib.parse import urljoin
+
+from bs4 import BeautifulSoup
+
class ParserVersion(Enum):
- ALPHA=1
+ ALPHA = 1
+
def make_chuni_intl_parser(identifier: str, parser: ParserVersion):
def alpha_parser(html: str):
@@ -37,23 +40,65 @@ def make_chuni_intl_parser(identifier: str, parser: ParserVersion):
dt = datetime(year, month, day, tzinfo=jst)
timestamp = int(dt.timestamp())
- results.append({
- "date": dt.strftime("%Y-%m-%d"),
- "identifier": identifier,
- "type": None,
- "timestamp": timestamp,
- "headline": None,
- "content": headline,
- "url": url,
- "images": [{
- "image": image_url,
- "link": url
- }] if image_url else []
- })
+ results.append(
+ {
+ "date": dt.strftime("%Y-%m-%d"),
+ "identifier": identifier,
+ "type": None,
+ "timestamp": timestamp,
+ "headline": None,
+ "content": headline,
+ "url": url,
+ "images": [{"image": image_url, "link": url}] if image_url else [],
+ }
+ )
return results
if parser == ParserVersion.ALPHA:
return alpha_parser
-parse_chuni_intl_luminous_plus_news_site = make_chuni_intl_parser("CHUNITHM_INTL_LUMINOUS_PLUS", ParserVersion.ALPHA)
+
+def make_image_extractor(version: ParserVersion):
+ """
+ Gets all the images from a full post page as CHUNITHM intl has more relevant images
+ hidden in the actual posts
+ """
+
+ def image_extractor_alpha(html: str):
+ base_url = "https://info-chunithm.sega.com/"
+ soup = BeautifulSoup(html, "html.parser")
+ images = []
+ news_post = soup.select_one(".news--post")
+ if not news_post:
+ return images
+
+ for img in news_post.find_all("img"):
+ src = img.get("src") or img.get("data-src")
+ if not src:
+ continue
+
+ full_url = urljoin(base_url, src)
+ parent = img.find_parent("a")
+ link = parent.get("href") if parent and parent.name == "a" else None
+
+ images.append(
+ {"image": full_url, "link": urljoin(base_url, link) if link else None}
+ )
+
+ return images
+
+ if version == ParserVersion.ALPHA:
+ return image_extractor_alpha
+ else:
+ raise ValueError("Unknown Parser Version")
+
+
+parse_chuni_intl_luminous_plus_news_site = make_chuni_intl_parser(
+ "CHUNITHM_INTL_LUMINOUS_PLUS", ParserVersion.ALPHA
+)
+
+parse_chuni_intl_verse_news_site = make_chuni_intl_parser(
+ "CHUNITHM_INTL_VERSE", ParserVersion.ALPHA
+)
+parse_chuni_intl_verse_post_images = make_image_extractor(ParserVersion.ALPHA)
diff --git a/sega/chuni_jp.py b/sega/chuni_jp.py
index 981fb8f..1feafc1 100644
--- a/sega/chuni_jp.py
+++ b/sega/chuni_jp.py
@@ -1,11 +1,14 @@
-from bs4 import BeautifulSoup
-from datetime import datetime, timezone, timedelta
-from urllib.parse import urljoin
import re
+from datetime import datetime, timedelta, timezone
from enum import Enum
+from urllib.parse import urljoin
+
+from bs4 import BeautifulSoup
+
class ParserVersion(Enum):
- ALPHA=1
+ ALPHA = 1
+
def make_chuni_jp_parser(identifier: str, parser: ParserVersion):
def alpha_parser(html: str):
@@ -64,7 +67,47 @@ def make_chuni_jp_parser(identifier: str, parser: ParserVersion):
news_entries.append(news_dict)
return news_entries
+
if parser == ParserVersion.ALPHA:
return alpha_parser
-parse_chuni_jp_verse_news_site = make_chuni_jp_parser("CHUNITHM_JP_VERSE", ParserVersion.ALPHA)
+
+def make_image_extractor(version: ParserVersion):
+ """
+ Gets all the images from a full post page, as CHUNITHM JP has more relevant images
+ hidden in the actual posts
+ """
+
+ def image_extractor_alpha(html: str):
+ base_url = "https://info-chunithm.sega.jp/"
+ soup = BeautifulSoup(html, "html.parser")
+ images = []
+
+ container = soup.select_one(".chuniCommonBox-inner-main")
+ if not container:
+ return images
+ for img in container.find_all("img"):
+ if img.find_parent("p") and "©" in img.find_parent("p").text:
+ continue
+
+ src = img.get("src") or img.get("data-src")
+ if not src:
+ continue
+ full_url = urljoin(base_url, src)
+ parent = img.find_parent("a")
+ link = parent.get("href") if parent and parent.name == "a" else None
+ images.append(
+ {"image": full_url, "link": urljoin(base_url, link) if link else None}
+ )
+ return images
+
+ if version == ParserVersion.ALPHA:
+ return image_extractor_alpha
+ else:
+ raise ValueError("Unknown Parser Version")
+
+
+parse_chuni_jp_verse_news_site = make_chuni_jp_parser(
+ "CHUNITHM_JP_VERSE", ParserVersion.ALPHA
+)
+parse_chuni_jp_verse_post_images = make_image_extractor(ParserVersion.ALPHA)
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage