aboutsummaryrefslogtreecommitdiffstats
path: root/sega/chuni_intl.py
blob: 64d279cf5b13447ccce6804824fea6b7d2556848 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import re
from datetime import datetime, timedelta, timezone
from enum import Enum
import json
from urllib.parse import urljoin

from bs4 import BeautifulSoup


class ParserVersion(Enum):
    """Identifies which parser implementation the factory functions build."""

    # The only version defined so far; the factories in this module map it to
    # their ``alpha_parser`` / ``image_extractor_alpha`` implementations.
    ALPHA = 1


def make_chuni_intl_parser(identifier: str, parser: ParserVersion):
    """Build a parser for the CHUNITHM international news listing page.

    Args:
        identifier: Value stored under the ``"identifier"`` key of every
            entry the returned parser produces.
        parser: Which parser implementation to return.

    Returns:
        A callable that takes the listing page HTML and returns a list of
        entry dicts (keys: date, identifier, type, timestamp, headline,
        content, url, images, is_ai_summary).

    Raises:
        ValueError: If ``parser`` is not a supported ``ParserVersion``.
    """
    # Fail loudly on an unsupported version (same error as
    # make_image_extractor) instead of silently returning None.
    if parser != ParserVersion.ALPHA:
        raise ValueError("Unknown Parser Version")

    base_url = "https://info-chunithm.sega.com/"
    # Invariant across items and calls, so built once here.
    date_pattern = re.compile(r"(\d{4})\.(\d{1,2})\.(\d{1,2})")
    jst = timezone(timedelta(hours=9))

    def alpha_parser(html: str):
        """
        Confirmed on:
        LUMINOUS PLUS
        """
        soup = BeautifulSoup(html, "html.parser")
        results = []

        for item in soup.select("li.news--list__item"):
            a_tag = item.select_one("a.news--list__post")
            date_tag = item.select_one("div.news--date")
            title_tag = item.select_one("p.news--title")
            # select_one yields None when nothing matches; skip the malformed
            # item rather than letting one bad entry abort the whole page.
            if a_tag is None or date_tag is None or title_tag is None:
                continue

            href = a_tag.get("href")
            if href is None:
                continue
            url = urljoin(base_url, href)
            headline = title_tag.text.strip()

            # The thumbnail is optional: no <img>, or an <img> without a
            # usable src, simply produces an empty image list.
            img_tag = item.select_one("div.news--thumbnail img")
            src = img_tag.get("src") if img_tag is not None else None
            image_url = urljoin(base_url, src) if src else None

            date_match = date_pattern.match(date_tag.text.strip())
            if not date_match:
                continue
            year, month, day = map(int, date_match.groups())
            try:
                # Midnight at UTC+9 on the listed day.
                dt = datetime(year, month, day, tzinfo=jst)
            except ValueError:
                # Digits matched the pattern but are not a real calendar date.
                continue

            results.append(
                {
                    "date": dt.strftime("%Y-%m-%d"),
                    "identifier": identifier,
                    "type": None,
                    "timestamp": int(dt.timestamp()),
                    "headline": None,
                    # The listing's title text is stored as the entry content.
                    "content": headline,
                    "url": url,
                    "images": (
                        [{"image": image_url, "link": url}] if image_url else []
                    ),
                    "is_ai_summary": False,
                }
            )

        return results

    return alpha_parser


def make_image_extractor(version: ParserVersion):
    """
    Return a function that collects every image inside a full post page.

    CHUNITHM intl keeps its more relevant images inside the post itself, so
    the listing thumbnail alone is not enough.

    Raises ValueError when ``version`` is not a known ParserVersion.
    """

    def image_extractor_alpha(html: str):
        """Return ``{"image", "link"}`` dicts for each <img> under ``.news--post``."""
        site_root = "https://info-chunithm.sega.com/"
        post_body = BeautifulSoup(html, "html.parser").select_one(".news--post")
        collected = []
        if not post_body:
            return collected

        for tag in post_body.find_all("img"):
            # Fall back to data-src when src is missing or empty.
            source = tag.get("src") or tag.get("data-src")
            if source:
                # An enclosing <a>, if any, supplies the link target.
                anchor = tag.find_parent("a")
                href = None
                if anchor and anchor.name == "a":
                    href = anchor.get("href")

                entry = {"image": urljoin(site_root, source)}
                entry["link"] = urljoin(site_root, href) if href else None
                collected.append(entry)

        return collected

    if version == ParserVersion.ALPHA:
        return image_extractor_alpha
    raise ValueError("Unknown Parser Version")

def parse_chuni_intl_api_route(raw_api_data: str, identifier: str, limit: int):
    """Convert the CHUNITHM international news API payload into entry dicts.

    Args:
        raw_api_data: JSON text that decodes to a sequence of post objects.
        identifier: Value stored under the ``"identifier"`` key of each entry.
        limit: Number of leading posts to consider. The cut is applied before
            malformed posts are dropped, so fewer entries may be returned.

    Returns:
        A list of entry dicts (keys: date, identifier, type, timestamp,
        headline, content, url, images, is_ai_summary).

    Raises:
        json.JSONDecodeError: If ``raw_api_data`` is not valid JSON.
    """
    jst = timezone(timedelta(hours=9))
    entries = []
    for post_data in json.loads(raw_api_data)[:limit]:
        try:
            date_str = post_data["date"]
            content = post_data["desc"]
            url = post_data["permalink"]
            # Midnight at UTC+9 on the listed day.
            dt = datetime.strptime(date_str, "%Y.%m.%d").replace(tzinfo=jst)
        except (KeyError, TypeError, ValueError):
            # A post with a missing field or unparseable date is skipped so
            # that one bad record does not abort the whole batch.
            continue

        # The thumbnail is optional; without one the image list is empty
        # rather than holding an entry with no image URL.
        thumbnail = post_data.get("thumbnail")
        images = [{"image": thumbnail, "link": None}] if thumbnail else []

        # post_data["title"] is deliberately unused: per the original author
        # it always duplicates "desc".
        entries.append(
            {
                # NOTE(review): passed through in the API's dotted form
                # (YYYY.MM.DD), while alpha_parser in this module emits
                # YYYY-MM-DD — confirm which form consumers expect.
                "date": date_str,
                "identifier": identifier,
                "type": None,
                "timestamp": int(dt.timestamp()),
                "headline": None,
                "content": content,
                "url": url,
                "images": images,
                "is_ai_summary": False,
            }
        )
    return entries


# Ready-made parser for the news listing page; every entry it returns is
# tagged with the "CHUNITHM_INTL" identifier.
parse_chuni_intl_news_site = make_chuni_intl_parser(
    "CHUNITHM_INTL", ParserVersion.ALPHA
)
# Ready-made extractor that pulls the images out of a single full post page.
parse_chuni_intl_post_images = make_image_extractor(ParserVersion.ALPHA)
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage