aboutsummaryrefslogtreecommitdiffstats
path: root/sega/maimaidx_jp.py
diff options
context:
space:
mode:
author Pinapelz <yukais@pinapelz.com> 2025-04-14 01:56:18 -0700
committer Pinapelz <yukais@pinapelz.com> 2025-04-14 01:56:18 -0700
commit dc279404b1f6e371d6d7acd1380a265762e60218 (patch)
tree 85444616b0c98697bf060bb932557ae323a0c9ed /sega/maimaidx_jp.py
parent 147c36d207ca74e876b6b4703fd3f57f3ab57e56 (diff)
add maimai DX JPN scraping
Diffstat (limited to 'sega/maimaidx_jp.py')
-rw-r--r-- sega/maimaidx_jp.py 48
1 files changed, 48 insertions, 0 deletions
diff --git a/sega/maimaidx_jp.py b/sega/maimaidx_jp.py
new file mode 100644
index 0000000..5a88ef1
--- /dev/null
+++ b/sega/maimaidx_jp.py
@@ -0,0 +1,48 @@
+from bs4 import BeautifulSoup
+from datetime import datetime, timezone, timedelta
+from urllib.parse import urljoin
+import re
+
# Fixed UTC+9 offset used to interpret the dates shown on the news page.
_JST = timezone(timedelta(hours=9))


def _jst_midnight_timestamp(raw_date):
    """Return the Unix timestamp of the ``YYYY.MM.DD`` date that starts *raw_date*.

    Only the first whitespace-separated token is parsed; anything after it is
    ignored. The date is interpreted as midnight at UTC+9.

    Returns 0 when *raw_date* is ``None``, empty, or does not begin with a
    date in ``YYYY.MM.DD`` form.
    """
    tokens = raw_date.split() if raw_date else []
    if not tokens:
        return 0
    try:
        parsed = datetime.strptime(tokens[0], "%Y.%m.%d")
    except ValueError:
        # Unparseable date text: fall back to the 0 sentinel rather than
        # failing the whole page.
        return 0
    return int(parsed.replace(tzinfo=_JST).timestamp())


def parse_maimaidx_jp_prism_plus_news_site(html: str) -> list:
    """Extract news entries from the maimai DX (JP) news page markup.

    Every ``.newsBox`` element nested under ``.maiPager-content`` yields one
    dict. Relative links and image sources are resolved against
    ``https://info-maimai.sega.jp/``.

    Args:
        html: Raw HTML of the news page.

    Returns:
        A list of dicts, in document order, each with the keys:

        * ``date``: text of the ``.newsDate`` element, or ``None`` if absent.
        * ``identifier``: lower-cased headline with every run of non-word
          characters replaced by ``-``; ``"unknown"`` if there is no headline.
        * ``type``: always ``None``.
        * ``timestamp``: Unix timestamp of the parsed date (midnight, UTC+9),
          or ``0`` when the date is missing or unparseable.
        * ``headline``: text of the ``.newsLink`` element, or ``None``.
        * ``content``: all text inside the box, one fragment per line.
        * ``url``: absolute URL of the first ``<a href>`` in the box, or
          ``None``.
        * ``images``: a one-element list ``[{"image": ..., "link": url}]``
          when the box contains an ``<img>`` with a non-empty ``src``,
          otherwise an empty list.
    """
    soup = BeautifulSoup(html, "html.parser")
    base_url = "https://info-maimai.sega.jp/"
    news_items = []

    for box in soup.select(".maiPager-content .newsBox"):
        a_tag = box.select_one("a")
        href = a_tag.get("href") if a_tag is not None else None
        url = urljoin(base_url, href) if href else None

        # Use .get() so an <img> without a src attribute (e.g. lazy-loaded)
        # is skipped instead of raising KeyError and aborting the parse.
        img_tag = box.select_one("img")
        src = img_tag.get("src") if img_tag is not None else None
        image_url = urljoin(base_url, src) if src else None

        date_tag = box.select_one(".newsDate")
        raw_date = date_tag.get_text(strip=True) if date_tag is not None else None
        timestamp = _jst_midnight_timestamp(raw_date)

        headline_tag = box.select_one(".newsLink")
        headline = (
            headline_tag.get_text(strip=True) if headline_tag is not None else None
        )
        content = box.get_text(separator="\n", strip=True)
        identifier = re.sub(r"\W+", "-", headline.lower()) if headline else "unknown"

        news_items.append({
            "date": raw_date,
            "identifier": identifier,
            "type": None,
            "timestamp": timestamp,
            "headline": headline,
            "content": content,
            "url": url,
            "images": [{"image": image_url, "link": url}] if image_url else [],
        })

    return news_items
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage