1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
|
import re
from datetime import datetime, timedelta, timezone
from enum import Enum
import json
from urllib.parse import urljoin
from bs4 import BeautifulSoup
class ParserVersion(Enum):
    """Selects which parser / extractor implementation the factories return."""

    # The only implementation defined in this module so far.
    ALPHA = 1
def make_chuni_intl_parser(identifier: str, parser: ParserVersion):
    """Build a parser for the CHUNITHM International news listing page.

    Args:
        identifier: Value stored under the ``"identifier"`` key of every entry
            the returned parser produces.
        parser: Which parser implementation to return.

    Returns:
        A callable that takes the listing page HTML and returns a list of
        entry dicts (one per news item that could be parsed).

    Raises:
        ValueError: If ``parser`` is not a supported version. This mirrors
            ``make_image_extractor`` instead of silently returning ``None``.
    """
    base_url = "https://info-chunithm.sega.com/"
    # Loop invariants: the listing only carries calendar dates, which are
    # interpreted as midnight in a fixed UTC+9 offset.
    jst = timezone(timedelta(hours=9))
    date_pattern = re.compile(r"(\d{4})\.(\d{1,2})\.(\d{1,2})")

    def alpha_parser(html: str):
        """Parse the news listing markup into entry dicts.

        Confirmed on:
        LUMINOUS PLUS

        Items lacking a link, a date element, a title element, or a valid
        date are skipped rather than aborting the whole page.
        """
        soup = BeautifulSoup(html, "html.parser")
        results = []
        for item in soup.select("li.news--list__item"):
            a_tag = item.select_one("a.news--list__post")
            if not a_tag:
                continue
            href = a_tag.get("href")
            date_tag = item.select_one("div.news--date")
            title_tag = item.select_one("p.news--title")
            if not href or date_tag is None or title_tag is None:
                # Malformed item: treat it like the other skip cases.
                continue

            date_match = date_pattern.match(date_tag.text.strip())
            if not date_match:
                continue
            year, month, day = map(int, date_match.groups())
            try:
                dt = datetime(year, month, day, tzinfo=jst)
            except ValueError:
                # Digits matched the pattern but are not a real calendar date.
                continue

            url = urljoin(base_url, href)
            headline = title_tag.text.strip()

            img_tag = item.select_one("div.news--thumbnail img")
            # Same src / data-src fallback the post image extractor uses.
            img_src = (
                (img_tag.get("src") or img_tag.get("data-src")) if img_tag else None
            )
            image_url = urljoin(base_url, img_src) if img_src else None

            results.append(
                {
                    "date": dt.strftime("%Y-%m-%d"),
                    "identifier": identifier,
                    "type": None,
                    "timestamp": int(dt.timestamp()),
                    # The listing title is deliberately stored as the content;
                    # the headline field is left empty.
                    "headline": None,
                    "content": headline,
                    "url": url,
                    "images": [{"image": image_url, "link": url}] if image_url else [],
                    "is_ai_summary": False,
                }
            )
        return results

    if parser == ParserVersion.ALPHA:
        return alpha_parser
    raise ValueError("Unknown Parser Version")
def make_image_extractor(version: ParserVersion):
    """Return a function that collects the images embedded in a full post page.

    The full CHUNITHM International post pages carry more relevant images than
    the listing, so they are harvested separately from the listing parser.

    Raises:
        ValueError: If ``version`` is not a known parser version.
    """
    if version != ParserVersion.ALPHA:
        raise ValueError("Unknown Parser Version")

    def image_extractor_alpha(html: str):
        """Collect ``{"image", "link"}`` records from the ``.news--post`` element."""
        site_root = "https://info-chunithm.sega.com/"
        gallery = []

        post_body = BeautifulSoup(html, "html.parser").select_one(".news--post")
        if not post_body:
            return gallery

        for picture in post_body.find_all("img"):
            # Fall back to data-src when src is absent or empty.
            source = picture.get("src") or picture.get("data-src")
            if not source:
                continue

            # An image wrapped in an anchor links to that anchor's target.
            anchor = picture.find_parent("a")
            href = None
            if anchor and anchor.name == "a":
                href = anchor.get("href")

            gallery.append(
                {
                    "image": urljoin(site_root, source),
                    "link": urljoin(site_root, href) if href else None,
                }
            )
        return gallery

    return image_extractor_alpha
def parse_chuni_intl_api_route(raw_api_data: str, identifier: str, limit: int):
    """Convert the CHUNITHM International news API payload into entry dicts.

    Args:
        raw_api_data: JSON text that decodes to a list of post objects. Each
            post must provide ``"date"`` (``YYYY.MM.DD``), ``"title"``,
            ``"desc"`` and ``"permalink"``; ``"thumbnail"`` is optional.
        identifier: Value stored under the ``"identifier"`` key of every entry.
        limit: Maximum number of posts to convert, taken from the start of
            the list.

    Returns:
        A list of entry dicts with the same keys the HTML listing parser
        produces. ``"date"`` is normalised to ``YYYY-MM-DD`` so both sources
        agree, and ``"images"`` is empty when a post has no thumbnail.

    Raises:
        json.JSONDecodeError: If ``raw_api_data`` is not valid JSON.
        KeyError: If a post lacks one of the required keys.
        ValueError: If a post's date does not match ``YYYY.MM.DD``.
    """
    # Dates carry no time of day; interpret them as midnight at UTC+9.
    jst = timezone(timedelta(hours=9))

    entries = []
    for post_data in json.loads(raw_api_data)[:limit]:
        dt = datetime.strptime(post_data["date"], "%Y.%m.%d").replace(tzinfo=jst)
        thumbnail = post_data.get("thumbnail")
        entries.append(
            {
                # Same ISO-style format the HTML listing parser emits.
                "date": dt.strftime("%Y-%m-%d"),
                "identifier": identifier,
                "type": None,
                "timestamp": int(dt.timestamp()),
                "headline": post_data["title"],
                "content": post_data["desc"],
                "url": post_data["permalink"],
                # No thumbnail -> no image record, matching the HTML parser.
                "images": [{"image": thumbnail, "link": None}] if thumbnail else [],
                "is_ai_summary": False,
            }
        )
    return entries
# Ready-made listing parser; every entry it returns is tagged "CHUNITHM_INTL".
parse_chuni_intl_news_site = make_chuni_intl_parser(
    identifier="CHUNITHM_INTL",
    parser=ParserVersion.ALPHA,
)

# Ready-made extractor for the images embedded in a full post page.
parse_chuni_intl_post_images = make_image_extractor(version=ParserVersion.ALPHA)
|