import json
import re
from datetime import datetime, timedelta, timezone
from urllib.parse import urljoin

from bs4 import BeautifulSoup

# Dates in the feed and on the site carry no time zone; every parser in this
# module interprets them as midnight UTC+9 (JST).
_JST = timezone(timedelta(hours=9))

# Root used to resolve relative links and image paths found in the HTML.
_BASE_URL = "https://info-chunithm.sega.com/"

# Matches the "YYYY.M.D" date prefix shown on news list items.
_DATE_RE = re.compile(r"(\d{4})\.(\d{1,2})\.(\d{1,2})")


def parse_chuni_intl_api_route(raw_api_data: str, identifier: str, limit: int):
    """Convert the JSON news feed into a list of normalized news entries.

    Args:
        raw_api_data: JSON text holding a list of post objects. Each post must
            provide the keys "date" (formatted "YYYY.MM.DD"), "thumbnail",
            "desc" and "permalink".
        identifier: Value copied verbatim into every entry's "identifier".
        limit: Maximum number of posts to convert, taken from the front of
            the list.

    Returns:
        A list of entry dicts with the keys "date", "identifier", "type",
        "timestamp", "headline", "content", "url", "images" and
        "is_ai_summary". "date" keeps the feed's original "YYYY.MM.DD" text.

    Raises:
        json.JSONDecodeError: If ``raw_api_data`` is not valid JSON.
        KeyError: If a post lacks one of the required keys.
        ValueError: If a post's date does not match "YYYY.MM.DD".
    """
    posts = json.loads(raw_api_data)[:limit]

    entries = []
    for post in posts:
        date_str = post["date"]
        dt = datetime.strptime(date_str, "%Y.%m.%d").replace(tzinfo=_JST)

        thumbnail_url = post["thumbnail"]
        content = post["desc"]
        # post["title"] is deliberately not used as the headline: it is always
        # just the same text as "desc".
        url = post["permalink"]

        entries.append(
            {
                "date": date_str,
                "identifier": identifier,
                "type": None,
                "timestamp": int(dt.timestamp()),
                "headline": None,
                "content": content,
                "url": url,
                "images": [{"image": thumbnail_url, "link": None}],
                "is_ai_summary": False,
            }
        )

    return entries


def parse_chuni_intl_post_images(html: str):
    """Collect every image inside the ``.news--post`` element of a post page.

    CHUNITHM intl keeps its more relevant images inside the full post rather
    than in the list thumbnail, so this is run on the post page itself.

    Args:
        html: HTML source of a single news post page.

    Returns:
        A list of ``{"image": <absolute URL>, "link": <absolute URL or None>}``
        dicts in document order. "link" is the href of the nearest enclosing
        ``<a>`` element, if there is one. The list is empty when the page has
        no ``.news--post`` element.
    """
    soup = BeautifulSoup(html, "html.parser")
    images = []

    news_post = soup.select_one(".news--post")
    if not news_post:
        return images

    for img in news_post.find_all("img"):
        # Some images only expose their address through "data-src".
        src = img.get("src") or img.get("data-src")
        if not src:
            continue

        anchor = img.find_parent("a")
        link = anchor.get("href") if anchor else None

        images.append(
            {
                "image": urljoin(_BASE_URL, src),
                "link": urljoin(_BASE_URL, link) if link else None,
            }
        )

    return images


def parse_chuni_intl_news_site(html: str):
    """Parse the news list page into normalized news entries.

    Confirmed on: LUMINOUS PLUS

    List items that cannot be turned into a complete entry are skipped rather
    than aborting the whole parse: items without a post link (or without an
    href on it), without a date or title element, or whose date text is not a
    real "YYYY.M.D" calendar date.

    Args:
        html: HTML source of the news list page.

    Returns:
        A list of entry dicts with the keys "date" (formatted "YYYY-MM-DD"),
        "identifier", "type", "timestamp", "headline", "content", "url",
        "images" and "is_ai_summary". "images" holds the thumbnail linked to
        the post URL, or is empty when the item has no usable thumbnail.
    """
    identifier = "CHUNITHM_INTL"
    soup = BeautifulSoup(html, "html.parser")

    results = []
    for item in soup.select("li.news--list__item"):
        a_tag = item.select_one("a.news--list__post")
        href = a_tag.get("href") if a_tag else None
        if not href:
            continue

        date_tag = item.select_one("div.news--date")
        title_tag = item.select_one("p.news--title")
        if date_tag is None or title_tag is None:
            continue

        date_match = _DATE_RE.match(date_tag.text.strip())
        if not date_match:
            continue

        year, month, day = map(int, date_match.groups())
        try:
            dt = datetime(year, month, day, tzinfo=_JST)
        except ValueError:
            # Digits matched the pattern but do not form a valid date.
            continue

        url = urljoin(_BASE_URL, href)
        headline = title_tag.text.strip()

        # Fall back to "data-src" the same way the post-page parser does.
        img_tag = item.select_one("div.news--thumbnail img")
        img_src = (img_tag.get("src") or img_tag.get("data-src")) if img_tag else None
        image_url = urljoin(_BASE_URL, img_src) if img_src else None

        results.append(
            {
                "date": dt.strftime("%Y-%m-%d"),
                "identifier": identifier,
                "type": None,
                "timestamp": int(dt.timestamp()),
                "headline": None,
                "content": headline,
                "url": url,
                "images": [{"image": image_url, "link": url}] if image_url else [],
                "is_ai_summary": False,
            }
        )

    return results