aboutsummaryrefslogtreecommitdiffstats
path: root/sega
diff options
context:
space:
mode:
Diffstat (limited to 'sega')
-rw-r--r-- sega/chuni_intl.py 81
-rw-r--r-- sega/chuni_jp.py 53
2 files changed, 111 insertions, 23 deletions
diff --git a/sega/chuni_intl.py b/sega/chuni_intl.py
index 9b176dd..88b7695 100644
--- a/sega/chuni_intl.py
+++ b/sega/chuni_intl.py
@@ -1,11 +1,14 @@
-from bs4 import BeautifulSoup
-from datetime import datetime, timezone, timedelta
-from urllib.parse import urljoin
import re
+from datetime import datetime, timedelta, timezone
from enum import Enum
+from urllib.parse import urljoin
+
+from bs4 import BeautifulSoup
+
class ParserVersion(Enum):
- ALPHA=1
+ ALPHA = 1
+
def make_chuni_intl_parser(identifier: str, parser: ParserVersion):
def alpha_parser(html: str):
@@ -37,23 +40,65 @@ def make_chuni_intl_parser(identifier: str, parser: ParserVersion):
dt = datetime(year, month, day, tzinfo=jst)
timestamp = int(dt.timestamp())
- results.append({
- "date": dt.strftime("%Y-%m-%d"),
- "identifier": identifier,
- "type": None,
- "timestamp": timestamp,
- "headline": None,
- "content": headline,
- "url": url,
- "images": [{
- "image": image_url,
- "link": url
- }] if image_url else []
- })
+ results.append(
+ {
+ "date": dt.strftime("%Y-%m-%d"),
+ "identifier": identifier,
+ "type": None,
+ "timestamp": timestamp,
+ "headline": None,
+ "content": headline,
+ "url": url,
+ "images": [{"image": image_url, "link": url}] if image_url else [],
+ }
+ )
return results
if parser == ParserVersion.ALPHA:
return alpha_parser
-parse_chuni_intl_luminous_plus_news_site = make_chuni_intl_parser("CHUNITHM_INTL_LUMINOUS_PLUS", ParserVersion.ALPHA)
+
+def make_image_extractor(version: ParserVersion):
+ """
+ Build the image extractor for a full CHUNITHM intl post page (CHUNITHM intl has more relevant
+ images hidden in the actual posts). Raises ValueError if ``version`` is not ParserVersion.ALPHA.
+ """
+
+ def image_extractor_alpha(html: str):
+ base_url = "https://info-chunithm.sega.com/"
+ soup = BeautifulSoup(html, "html.parser")
+ images = []
+ news_post = soup.select_one(".news--post")  # only images inside this element are collected
+ if not news_post:
+ return images  # no post container found: return the empty list
+
+ for img in news_post.find_all("img"):
+ src = img.get("src") or img.get("data-src")  # use src, falling back to data-src
+ if not src:
+ continue
+
+ full_url = urljoin(base_url, src)  # resolve relative paths against base_url
+ parent = img.find_parent("a")
+ link = parent.get("href") if parent and parent.name == "a" else None
+
+ images.append(
+ {"image": full_url, "link": urljoin(base_url, link) if link else None}
+ )
+
+ return images
+
+ if version == ParserVersion.ALPHA:
+ return image_extractor_alpha
+ else:
+ raise ValueError("Unknown Parser Version")
+
+
+parse_chuni_intl_luminous_plus_news_site = make_chuni_intl_parser(
+ "CHUNITHM_INTL_LUMINOUS_PLUS", ParserVersion.ALPHA
+)
+
+parse_chuni_intl_verse_news_site = make_chuni_intl_parser(
+ "CHUNITHM_INTL_VERSE", ParserVersion.ALPHA
+)
+parse_chuni_intl_verse_post_images = make_image_extractor(ParserVersion.ALPHA)
diff --git a/sega/chuni_jp.py b/sega/chuni_jp.py
index 981fb8f..1feafc1 100644
--- a/sega/chuni_jp.py
+++ b/sega/chuni_jp.py
@@ -1,11 +1,14 @@
-from bs4 import BeautifulSoup
-from datetime import datetime, timezone, timedelta
-from urllib.parse import urljoin
import re
+from datetime import datetime, timedelta, timezone
from enum import Enum
+from urllib.parse import urljoin
+
+from bs4 import BeautifulSoup
+
class ParserVersion(Enum):
- ALPHA=1
+ ALPHA = 1
+
def make_chuni_jp_parser(identifier: str, parser: ParserVersion):
def alpha_parser(html: str):
@@ -64,7 +67,47 @@ def make_chuni_jp_parser(identifier: str, parser: ParserVersion):
news_entries.append(news_dict)
return news_entries
+
if parser == ParserVersion.ALPHA:
return alpha_parser
-parse_chuni_jp_verse_news_site = make_chuni_jp_parser("CHUNITHM_JP_VERSE", ParserVersion.ALPHA)
+
+def make_image_extractor(version: ParserVersion):
+ """
+ Build the image extractor for a full CHUNITHM JP post page (base URL info-chunithm.sega.jp),
+ skipping images in "©" paragraphs. Raises ValueError if ``version`` is not ParserVersion.ALPHA.
+ """
+
+ def image_extractor_alpha(html: str):
+ base_url = "https://info-chunithm.sega.jp/"
+ soup = BeautifulSoup(html, "html.parser")
+ images = []
+
+ container = soup.select_one(".chuniCommonBox-inner-main")  # only images inside this element are collected
+ if not container:
+ return images  # no container found: return the empty list
+ for img in container.find_all("img"):
+ if img.find_parent("p") and "©" in img.find_parent("p").text:  # skip images whose enclosing <p> text contains "©"
+ continue
+
+ src = img.get("src") or img.get("data-src")  # use src, falling back to data-src
+ if not src:
+ continue
+ full_url = urljoin(base_url, src)  # resolve relative paths against base_url
+ parent = img.find_parent("a")
+ link = parent.get("href") if parent and parent.name == "a" else None
+ images.append(
+ {"image": full_url, "link": urljoin(base_url, link) if link else None}
+ )
+ return images
+
+ if version == ParserVersion.ALPHA:
+ return image_extractor_alpha
+ else:
+ raise ValueError("Unknown Parser Version")
+
+
+parse_chuni_jp_verse_news_site = make_chuni_jp_parser(
+ "CHUNITHM_JP_VERSE", ParserVersion.ALPHA
+)
+parse_chuni_jp_verse_post_images = make_image_extractor(ParserVersion.ALPHA)
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage