sega/chuni_intl.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112

import re
from datetime import datetime, timedelta, timezone
import json
from urllib.parse import urljoin

from bs4 import BeautifulSoup


def parse_chuni_intl_api_route(raw_api_data: str, identifier: str, limit: int):
    """
    Convert the JSON payload of the CHUNITHM intl news API into feed entries.

    Args:
        raw_api_data: JSON text holding a list of post objects. Each post is
            read for its "date", "thumbnail", "desc" and "permalink" keys.
        identifier: Value copied verbatim into every entry's "identifier".
        limit: Slice bound applied to the decoded list; only the first
            ``limit`` posts are converted.

    Returns:
        A list of entry dicts, one per post, in the order they appear in the
        payload.

    Raises:
        KeyError: if a post lacks one of the keys listed above.
        ValueError: if a post's "date" is not in ``YYYY.MM.DD`` form (or the
            payload is not valid JSON).
    """
    # Dates carry no time component; they are pinned to midnight at UTC+9.
    utc_plus_9 = timezone(timedelta(hours=9))
    posts = json.loads(raw_api_data)[:limit]

    def build_entry(post):
        posted_on = post["date"]
        midnight = datetime.strptime(posted_on, "%Y.%m.%d").replace(tzinfo=utc_plus_9)
        thumbnail = {"image": post["thumbnail"], "link": None}
        # post["title"] is deliberately ignored: per the original author's
        # note it always duplicates "desc", so "headline" stays None.
        return {
            "date": posted_on,
            "identifier": identifier,
            "type": None,
            "timestamp": int(midnight.timestamp()),
            "headline": None,
            "content": post["desc"],
            "url": post["permalink"],
            "images": [thumbnail],
            "is_ai_summary": False,
        }

    return [build_entry(post) for post in posts]


def parse_chuni_intl_post_images(html: str):
    """
    Collect every image embedded in a full CHUNITHM intl post page.

    The listing only exposes a thumbnail, while the post body itself carries
    the more relevant images, so this walks the ``.news--post`` container.

    Returns a list of ``{"image": <absolute url>, "link": <absolute url or
    None>}`` dicts in document order. "link" is the href of the ``<a>`` that
    wraps the image, if any. An empty list is returned when the page has no
    ``.news--post`` element.
    """
    site_root = "https://info-chunithm.sega.com/"
    post_body = BeautifulSoup(html, "html.parser").select_one(".news--post")
    collected = []
    if post_body is None:
        return collected

    for img_tag in post_body.find_all("img"):
        # Prefer the regular src; fall back to data-src when src is
        # missing or empty.
        source = img_tag.get("src") or img_tag.get("data-src")
        if source:
            href = None
            anchor = img_tag.find_parent("a")
            if anchor is not None and anchor.name == "a":
                href = anchor.get("href")

            # A missing or empty href is reported as "no link".
            target = urljoin(site_root, href) if href else None
            collected.append({"image": urljoin(site_root, source), "link": target})

    return collected


def parse_chuni_intl_news_site(html: str):
    """
    Parse the CHUNITHM intl news listing page into feed entries.

    Confirmed on:
    LUMINOUS PLUS

    Each ``li.news--list__item`` becomes one entry dict. The listing title is
    stored under "content" ("headline" is always None), the date is
    interpreted as midnight at UTC+9, and the thumbnail (if any) is emitted
    as a single image linking to the post.

    Malformed items are skipped rather than aborting the whole parse: an item
    is dropped when it has no post anchor, no href on that anchor, no date or
    title element, a date that does not start with ``YYYY.M.D``, or a date
    that is not a real calendar day.

    Returns:
        A list of entry dicts in document order.
    """
    identifier = "CHUNITHM_INTL"
    base_url = "https://info-chunithm.sega.com/"
    jst = timezone(timedelta(hours=9))  # loop-invariant, built once
    soup = BeautifulSoup(html, "html.parser")
    results = []

    for item in soup.select("li.news--list__item"):
        a_tag = item.select_one("a.news--list__post")
        date_tag = item.select_one("div.news--date")
        title_tag = item.select_one("p.news--title")
        # select_one() yields None when nothing matches; dereferencing it
        # would raise AttributeError and lose every other item on the page.
        if a_tag is None or date_tag is None or title_tag is None:
            continue

        # Tag.get() returns None for a missing attribute instead of raising
        # KeyError the way subscripting does.
        href = a_tag.get("href")
        if href is None:
            continue
        url = urljoin(base_url, href)

        date_match = re.match(r"(\d{4})\.(\d{1,2})\.(\d{1,2})", date_tag.text.strip())
        if not date_match:
            continue
        year, month, day = map(int, date_match.groups())
        try:
            dt = datetime(year, month, day, tzinfo=jst)
        except ValueError:
            # The regex accepts impossible dates such as 2024.13.40.
            continue

        # Same src -> data-src fallback as parse_chuni_intl_post_images, so a
        # thumbnail without a usable src does not raise or yield a bogus URL.
        img_tag = item.select_one("div.news--thumbnail img")
        src = (img_tag.get("src") or img_tag.get("data-src")) if img_tag is not None else None
        image_url = urljoin(base_url, src) if src else None

        results.append(
            {
                "date": dt.strftime("%Y-%m-%d"),
                "identifier": identifier,
                "type": None,
                "timestamp": int(dt.timestamp()),
                "headline": None,
                "content": title_tag.text.strip(),
                "url": url,
                "images": [{"image": image_url, "link": url}] if image_url else [],
                "is_ai_summary": False,
            }
        )

    return results