diff options
| author | Pinapelz <yukais@pinapelz.com> | 2025-04-21 14:04:07 -0700 |
|---|---|---|
| committer | Pinapelz <yukais@pinapelz.com> | 2025-04-21 14:04:07 -0700 |
| commit | d88ea267b780aebc077b43dcf56a0141c3abe6d4 (patch) | |
| tree | e65f915711e95a9224c3cc095c95d22ebe108951 /sega/chuni_jp.py | |
| parent | 0e0dc8c6615c648f55727ce3b2c96560864b5b0f (diff) | |
chuni: append images from detailed view of posts for more images
Diffstat (limited to 'sega/chuni_jp.py')
| -rw-r--r-- | sega/chuni_jp.py | 53 |
1 files changed, 48 insertions, 5 deletions
```diff
diff --git a/sega/chuni_jp.py b/sega/chuni_jp.py
index 981fb8f..1feafc1 100644
--- a/sega/chuni_jp.py
+++ b/sega/chuni_jp.py
@@ -1,11 +1,14 @@
-from bs4 import BeautifulSoup
-from datetime import datetime, timezone, timedelta
-from urllib.parse import urljoin
 import re
+from datetime import datetime, timedelta, timezone
 from enum import Enum
+from urllib.parse import urljoin
+
+from bs4 import BeautifulSoup
+
 
 class ParserVersion(Enum):
-    ALPHA=1
+    ALPHA = 1
+
 
 def make_chuni_jp_parser(identifier: str, parser: ParserVersion):
     def alpha_parser(html: str):
@@ -64,7 +67,47 @@ def make_chuni_jp_parser(identifier: str, parser: ParserVersion):
             news_entries.append(news_dict)
 
         return news_entries
+
     if parser == ParserVersion.ALPHA:
         return alpha_parser
 
-parse_chuni_jp_verse_news_site = make_chuni_jp_parser("CHUNITHM_JP_VERSE", ParserVersion.ALPHA)
+
+def make_image_extractor(version: ParserVersion):
+    """
+    Gets all the images from a full post page as CHUNITHM intl has more relevant images
+    hidden in the actual posts
+    """
+
+    def image_extractor_alpha(html: str):
+        base_url = "https://info-chunithm.sega.jp/"
+        soup = BeautifulSoup(html, "html.parser")
+        images = []
+
+        container = soup.select_one(".chuniCommonBox-inner-main")
+        if not container:
+            return images
+        for img in container.find_all("img"):
+            if img.find_parent("p") and "©" in img.find_parent("p").text:
+                continue
+
+            src = img.get("src") or img.get("data-src")
+            if not src:
+                continue
+            full_url = urljoin(base_url, src)
+            parent = img.find_parent("a")
+            link = parent.get("href") if parent and parent.name == "a" else None
+            images.append(
+                {"image": full_url, "link": urljoin(base_url, link) if link else None}
+            )
+        return images
+
+    if version == ParserVersion.ALPHA:
+        return image_extractor_alpha
+    else:
+        raise ValueError("Unknown Parser Version")
+
+
+parse_chuni_jp_verse_news_site = make_chuni_jp_parser(
+    "CHUNITHM_JP_VERSE", ParserVersion.ALPHA
+)
+parse_chuni_jp_verse_post_images = make_image_extractor(ParserVersion.ALPHA)
```
