about summary refs log tree commit diff stats
path: root/sega/chuni_jp.py
diff options
context:
space:
mode:
Diffstat (limited to 'sega/chuni_jp.py')
-rw-r--r-- sega/chuni_jp.py 169
1 files changed, 74 insertions, 95 deletions
diff --git a/sega/chuni_jp.py b/sega/chuni_jp.py
index 452e153..a914270 100644
--- a/sega/chuni_jp.py
+++ b/sega/chuni_jp.py
@@ -1,114 +1,93 @@
import re
from datetime import datetime, timedelta, timezone
-from enum import Enum
from urllib.parse import urljoin
from bs4 import BeautifulSoup
-class ParserVersion(Enum):
- ALPHA = 1
-
-
-def make_chuni_jp_parser(identifier: str, parser: ParserVersion):
- def alpha_parser(html: str):
- """
- Confirmed on:
- VERSE
- """
- soup = BeautifulSoup(html, "html.parser")
- news_entries = []
- news_wrapper = soup.find("div", class_="newsMainWrapper-left")
- if not news_wrapper:
- return news_entries
- for a_tag in news_wrapper.find_all("a", href=True):
- if not a_tag.find("div", class_="chuniCommonBox-inner"):
- continue
- news_dict = {}
- news_url = a_tag.get("href")
- news_dict["url"] = news_url
+def parse_chuni_jp_news_site(html: str):
+ """
+ Confirmed on:
+ VERSE
+ """
+ identifier = "CHUNITHM_JP"
+ soup = BeautifulSoup(html, "html.parser")
+ news_entries = []
+ news_wrapper = soup.find("div", class_="newsMainWrapper-left")
+ if not news_wrapper:
+ return news_entries
+ for a_tag in news_wrapper.find_all("a", href=True):
+ if not a_tag.find("div", class_="chuniCommonBox-inner"):
+ continue
+ news_dict = {}
+ news_url = a_tag.get("href")
+ news_dict["url"] = news_url
- date_container = a_tag.find("div", class_="chuniCommonBox-inner-title")
- date_str = None
- if date_container:
- title_span = date_container.find("span", class_="title")
- if title_span:
- text = title_span.get_text(strip=True)
- date_match = re.search(r"(\d{4}\.\d{2}\.\d{2})", text)
- if date_match:
- date_str = date_match.group(1)
- news_dict["date"] = date_str
- news_dict["type"] = None
- timestamp = None
- if date_str:
- try:
- dt = datetime.strptime(date_str, "%Y.%m.%d")
- dt = dt.replace(tzinfo=timezone(timedelta(hours=9)))
- timestamp = int(dt.timestamp())
- except Exception:
- timestamp = None
- news_dict["timestamp"] = timestamp
+ date_container = a_tag.find("div", class_="chuniCommonBox-inner-title")
+ date_str = None
+ if date_container:
+ title_span = date_container.find("span", class_="title")
+ if title_span:
+ text = title_span.get_text(strip=True)
+ date_match = re.search(r"(\d{4}\.\d{2}\.\d{2})", text)
+ if date_match:
+ date_str = date_match.group(1)
+ news_dict["date"] = date_str
+ news_dict["type"] = None
+ timestamp = None
+ if date_str:
+ try:
+ dt = datetime.strptime(date_str, "%Y.%m.%d")
+ dt = dt.replace(tzinfo=timezone(timedelta(hours=9)))
+ timestamp = int(dt.timestamp())
+ except Exception:
+ timestamp = None
+ news_dict["timestamp"] = timestamp
- main_content = a_tag.find("div", class_="chuniCommonBox-inner-main")
- content_text = ""
- if main_content:
- content_text = main_content.get_text(separator=" ", strip=True)
- news_dict["content"] = content_text
+ main_content = a_tag.find("div", class_="chuniCommonBox-inner-main")
+ content_text = ""
+ if main_content:
+ content_text = main_content.get_text(separator=" ", strip=True)
+ news_dict["content"] = content_text
- images = {"image": None, "link": None}
- if main_content:
- img_tag = main_content.find("img")
- if img_tag:
- images["image"] = img_tag.get("src")
- images["link"] = news_url
- news_dict["images"] = [images]
- news_dict["identifier"] = identifier
- news_dict["is_ai_summary"] = False
+ images = {"image": None, "link": None}
+ if main_content:
+ img_tag = main_content.find("img")
+ if img_tag:
+ images["image"] = img_tag.get("src")
+ images["link"] = news_url
+ news_dict["images"] = [images]
+ news_dict["identifier"] = identifier
+ news_dict["is_ai_summary"] = False
- news_entries.append(news_dict)
+ news_entries.append(news_dict)
- return news_entries
+ return news_entries
- if parser == ParserVersion.ALPHA:
- return alpha_parser
-
-def make_image_extractor(version: ParserVersion):
+def parse_chuni_jp_post_images(html: str):
"""
- Gets all the images from a full post page as CHUNITHM intl has more relevant images
- hidden in the actual posts
+ Gets all the images from a full post page as CHUNITHM JP has more relevant images
+ hidden in the actual posts.
"""
+ base_url = "https://info-chunithm.sega.jp/"
+ soup = BeautifulSoup(html, "html.parser")
+ images = []
- def image_extractor_alpha(html: str):
- base_url = "https://info-chunithm.sega.jp/"
- soup = BeautifulSoup(html, "html.parser")
- images = []
-
- container = soup.select_one(".chuniCommonBox-inner-main")
- if not container:
- return images
- for img in container.find_all("img"):
- if img.find_parent("p") and "©" in img.find_parent("p").text:
- continue
-
- src = img.get("src") or img.get("data-src")
- if not src:
- continue
- full_url = urljoin(base_url, src)
- parent = img.find_parent("a")
- link = parent.get("href") if parent and parent.name == "a" else None
- images.append(
- {"image": full_url, "link": urljoin(base_url, link) if link else None}
- )
+ container = soup.select_one(".chuniCommonBox-inner-main")
+ if not container:
return images
+ for img in container.find_all("img"):
+ if img.find_parent("p") and "©" in img.find_parent("p").text:
+ continue
- if version == ParserVersion.ALPHA:
- return image_extractor_alpha
- else:
- raise ValueError("Unknown Parser Version")
-
-
-parse_chuni_jp_news_site = make_chuni_jp_parser(
- "CHUNITHM_JP", ParserVersion.ALPHA
-)
-parse_chuni_jp_post_images = make_image_extractor(ParserVersion.ALPHA)
+ src = img.get("src") or img.get("data-src")
+ if not src:
+ continue
+ full_url = urljoin(base_url, src)
+ parent = img.find_parent("a")
+ link = parent.get("href") if parent and parent.name == "a" else None
+ images.append(
+ {"image": full_url, "link": urljoin(base_url, link) if link else None}
+ )
+ return images
Send patches to the email below:
yukais@pinapelz.com
Include the subject [PATCH repo_name].
pinapelz.com
homepage