aboutsummaryrefslogtreecommitdiffstats
path: root/sega
diff options
context:
space:
mode:
Diffstat (limited to 'sega')
-rw-r--r--sega/__init__.py23
-rw-r--r--sega/chuni_intl.py166
-rw-r--r--sega/chuni_jp.py169
-rw-r--r--sega/maimaidx_intl.py49
-rw-r--r--sega/maimaidx_jp.py89
-rw-r--r--sega/ongeki_jp.py102
6 files changed, 257 insertions, 341 deletions
diff --git a/sega/__init__.py b/sega/__init__.py
new file mode 100644
index 0000000..242ab52
--- /dev/null
+++ b/sega/__init__.py
@@ -0,0 +1,23 @@
+from sega.chuni_jp import parse_chuni_jp_news_site, parse_chuni_jp_post_images
+from sega.chuni_intl import (
+ parse_chuni_intl_api_route,
+ parse_chuni_intl_news_site,
+ parse_chuni_intl_post_images,
+)
+from sega.maimaidx_jp import parse_maimaidx_jp_news_site
+from sega.maimaidx_intl import parse_maimaidx_intl_api_route
+from sega.ongeki_jp import parse_ongeki_news_site
+from sega.idac import parse_idac_news_site, get_promo_image
+
+__all__ = [
+ "parse_chuni_jp_news_site",
+ "parse_chuni_jp_post_images",
+ "parse_chuni_intl_api_route",
+ "parse_chuni_intl_news_site",
+ "parse_chuni_intl_post_images",
+ "parse_maimaidx_jp_news_site",
+ "parse_maimaidx_intl_api_route",
+ "parse_ongeki_news_site",
+ "parse_idac_news_site",
+ "get_promo_image",
+] \ No newline at end of file
diff --git a/sega/chuni_intl.py b/sega/chuni_intl.py
index 64d279c..816b857 100644
--- a/sega/chuni_intl.py
+++ b/sega/chuni_intl.py
@@ -1,100 +1,11 @@
import re
from datetime import datetime, timedelta, timezone
-from enum import Enum
import json
from urllib.parse import urljoin
from bs4 import BeautifulSoup
-class ParserVersion(Enum):
- ALPHA = 1
-
-
-def make_chuni_intl_parser(identifier: str, parser: ParserVersion):
- def alpha_parser(html: str):
- """
- Confirmed on:
- LUMINOUS PLUS
- """
- soup = BeautifulSoup(html, "html.parser")
- base_url = "https://info-chunithm.sega.com/"
- items = soup.select("li.news--list__item")
- results = []
-
- for item in items:
- a_tag = item.select_one("a.news--list__post")
- if not a_tag:
- continue
-
- url = urljoin(base_url, a_tag["href"])
- date_text = item.select_one("div.news--date").text.strip()
- headline = item.select_one("p.news--title").text.strip()
- img_tag = item.select_one("div.news--thumbnail img")
- image_url = urljoin(base_url, img_tag["src"]) if img_tag else None
-
- date_match = re.match(r"(\d{4})\.(\d{1,2})\.(\d{1,2})", date_text)
- if not date_match:
- continue
- year, month, day = map(int, date_match.groups())
- jst = timezone(timedelta(hours=9))
- dt = datetime(year, month, day, tzinfo=jst)
- timestamp = int(dt.timestamp())
-
- results.append(
- {
- "date": dt.strftime("%Y-%m-%d"),
- "identifier": identifier,
- "type": None,
- "timestamp": timestamp,
- "headline": None,
- "content": headline,
- "url": url,
- "images": [{"image": image_url, "link": url}] if image_url else [],
- 'is_ai_summary': False
- }
- )
-
- return results
-
- if parser == ParserVersion.ALPHA:
- return alpha_parser
-
-
-def make_image_extractor(version: ParserVersion):
- """
- Gets all the images from a full post page as CHUNITHM intl has more relevant images
- hidden in the actual posts
- """
-
- def image_extractor_alpha(html: str):
- base_url = "https://info-chunithm.sega.com/"
- soup = BeautifulSoup(html, "html.parser")
- images = []
- news_post = soup.select_one(".news--post")
- if not news_post:
- return images
-
- for img in news_post.find_all("img"):
- src = img.get("src") or img.get("data-src")
- if not src:
- continue
-
- full_url = urljoin(base_url, src)
- parent = img.find_parent("a")
- link = parent.get("href") if parent and parent.name == "a" else None
-
- images.append(
- {"image": full_url, "link": urljoin(base_url, link) if link else None}
- )
-
- return images
-
- if version == ParserVersion.ALPHA:
- return image_extractor_alpha
- else:
- raise ValueError("Unknown Parser Version")
-
def parse_chuni_intl_api_route(raw_api_data: str, identifier: str, limit: int):
route_data = json.loads(raw_api_data)
route_data = route_data[:limit]
@@ -126,7 +37,76 @@ def parse_chuni_intl_api_route(raw_api_data: str, identifier: str, limit: int):
return entries
-parse_chuni_intl_news_site = make_chuni_intl_parser(
- "CHUNITHM_INTL", ParserVersion.ALPHA
-)
-parse_chuni_intl_post_images = make_image_extractor(ParserVersion.ALPHA)
+def parse_chuni_intl_post_images(html: str):
+ """
+ Gets all the images from a full post page as CHUNITHM intl has more relevant images
+ hidden in the actual posts.
+ """
+ base_url = "https://info-chunithm.sega.com/"
+ soup = BeautifulSoup(html, "html.parser")
+ images = []
+ news_post = soup.select_one(".news--post")
+ if not news_post:
+ return images
+
+ for img in news_post.find_all("img"):
+ src = img.get("src") or img.get("data-src")
+ if not src:
+ continue
+
+ full_url = urljoin(base_url, src)
+ parent = img.find_parent("a")
+ link = parent.get("href") if parent and parent.name == "a" else None
+
+ images.append(
+ {"image": full_url, "link": urljoin(base_url, link) if link else None}
+ )
+
+ return images
+
+
+def parse_chuni_intl_news_site(html: str):
+ """
+ Confirmed on:
+ LUMINOUS PLUS
+ """
+ identifier = "CHUNITHM_INTL"
+ soup = BeautifulSoup(html, "html.parser")
+ base_url = "https://info-chunithm.sega.com/"
+ items = soup.select("li.news--list__item")
+ results = []
+
+ for item in items:
+ a_tag = item.select_one("a.news--list__post")
+ if not a_tag:
+ continue
+
+ url = urljoin(base_url, a_tag["href"])
+ date_text = item.select_one("div.news--date").text.strip()
+ headline = item.select_one("p.news--title").text.strip()
+ img_tag = item.select_one("div.news--thumbnail img")
+ image_url = urljoin(base_url, img_tag["src"]) if img_tag else None
+
+ date_match = re.match(r"(\d{4})\.(\d{1,2})\.(\d{1,2})", date_text)
+ if not date_match:
+ continue
+ year, month, day = map(int, date_match.groups())
+ jst = timezone(timedelta(hours=9))
+ dt = datetime(year, month, day, tzinfo=jst)
+ timestamp = int(dt.timestamp())
+
+ results.append(
+ {
+ "date": dt.strftime("%Y-%m-%d"),
+ "identifier": identifier,
+ "type": None,
+ "timestamp": timestamp,
+ "headline": None,
+ "content": headline,
+ "url": url,
+ "images": [{"image": image_url, "link": url}] if image_url else [],
+ "is_ai_summary": False,
+ }
+ )
+
+ return results \ No newline at end of file
diff --git a/sega/chuni_jp.py b/sega/chuni_jp.py
index 452e153..a914270 100644
--- a/sega/chuni_jp.py
+++ b/sega/chuni_jp.py
@@ -1,114 +1,93 @@
import re
from datetime import datetime, timedelta, timezone
-from enum import Enum
from urllib.parse import urljoin
from bs4 import BeautifulSoup
-class ParserVersion(Enum):
- ALPHA = 1
-
-
-def make_chuni_jp_parser(identifier: str, parser: ParserVersion):
- def alpha_parser(html: str):
- """
- Confirmed on:
- VERSE
- """
- soup = BeautifulSoup(html, "html.parser")
- news_entries = []
- news_wrapper = soup.find("div", class_="newsMainWrapper-left")
- if not news_wrapper:
- return news_entries
- for a_tag in news_wrapper.find_all("a", href=True):
- if not a_tag.find("div", class_="chuniCommonBox-inner"):
- continue
- news_dict = {}
- news_url = a_tag.get("href")
- news_dict["url"] = news_url
+def parse_chuni_jp_news_site(html: str):
+ """
+ Confirmed on:
+ VERSE
+ """
+ identifier = "CHUNITHM_JP"
+ soup = BeautifulSoup(html, "html.parser")
+ news_entries = []
+ news_wrapper = soup.find("div", class_="newsMainWrapper-left")
+ if not news_wrapper:
+ return news_entries
+ for a_tag in news_wrapper.find_all("a", href=True):
+ if not a_tag.find("div", class_="chuniCommonBox-inner"):
+ continue
+ news_dict = {}
+ news_url = a_tag.get("href")
+ news_dict["url"] = news_url
- date_container = a_tag.find("div", class_="chuniCommonBox-inner-title")
- date_str = None
- if date_container:
- title_span = date_container.find("span", class_="title")
- if title_span:
- text = title_span.get_text(strip=True)
- date_match = re.search(r"(\d{4}\.\d{2}\.\d{2})", text)
- if date_match:
- date_str = date_match.group(1)
- news_dict["date"] = date_str
- news_dict["type"] = None
- timestamp = None
- if date_str:
- try:
- dt = datetime.strptime(date_str, "%Y.%m.%d")
- dt = dt.replace(tzinfo=timezone(timedelta(hours=9)))
- timestamp = int(dt.timestamp())
- except Exception:
- timestamp = None
- news_dict["timestamp"] = timestamp
+ date_container = a_tag.find("div", class_="chuniCommonBox-inner-title")
+ date_str = None
+ if date_container:
+ title_span = date_container.find("span", class_="title")
+ if title_span:
+ text = title_span.get_text(strip=True)
+ date_match = re.search(r"(\d{4}\.\d{2}\.\d{2})", text)
+ if date_match:
+ date_str = date_match.group(1)
+ news_dict["date"] = date_str
+ news_dict["type"] = None
+ timestamp = None
+ if date_str:
+ try:
+ dt = datetime.strptime(date_str, "%Y.%m.%d")
+ dt = dt.replace(tzinfo=timezone(timedelta(hours=9)))
+ timestamp = int(dt.timestamp())
+ except Exception:
+ timestamp = None
+ news_dict["timestamp"] = timestamp
- main_content = a_tag.find("div", class_="chuniCommonBox-inner-main")
- content_text = ""
- if main_content:
- content_text = main_content.get_text(separator=" ", strip=True)
- news_dict["content"] = content_text
+ main_content = a_tag.find("div", class_="chuniCommonBox-inner-main")
+ content_text = ""
+ if main_content:
+ content_text = main_content.get_text(separator=" ", strip=True)
+ news_dict["content"] = content_text
- images = {"image": None, "link": None}
- if main_content:
- img_tag = main_content.find("img")
- if img_tag:
- images["image"] = img_tag.get("src")
- images["link"] = news_url
- news_dict["images"] = [images]
- news_dict["identifier"] = identifier
- news_dict["is_ai_summary"] = False
+ images = {"image": None, "link": None}
+ if main_content:
+ img_tag = main_content.find("img")
+ if img_tag:
+ images["image"] = img_tag.get("src")
+ images["link"] = news_url
+ news_dict["images"] = [images]
+ news_dict["identifier"] = identifier
+ news_dict["is_ai_summary"] = False
- news_entries.append(news_dict)
+ news_entries.append(news_dict)
- return news_entries
+ return news_entries
- if parser == ParserVersion.ALPHA:
- return alpha_parser
-
-def make_image_extractor(version: ParserVersion):
+def parse_chuni_jp_post_images(html: str):
"""
- Gets all the images from a full post page as CHUNITHM intl has more relevant images
- hidden in the actual posts
+ Gets all the images from a full post page as CHUNITHM JP has more relevant images
+ hidden in the actual posts.
"""
+ base_url = "https://info-chunithm.sega.jp/"
+ soup = BeautifulSoup(html, "html.parser")
+ images = []
- def image_extractor_alpha(html: str):
- base_url = "https://info-chunithm.sega.jp/"
- soup = BeautifulSoup(html, "html.parser")
- images = []
-
- container = soup.select_one(".chuniCommonBox-inner-main")
- if not container:
- return images
- for img in container.find_all("img"):
- if img.find_parent("p") and "©" in img.find_parent("p").text:
- continue
-
- src = img.get("src") or img.get("data-src")
- if not src:
- continue
- full_url = urljoin(base_url, src)
- parent = img.find_parent("a")
- link = parent.get("href") if parent and parent.name == "a" else None
- images.append(
- {"image": full_url, "link": urljoin(base_url, link) if link else None}
- )
+ container = soup.select_one(".chuniCommonBox-inner-main")
+ if not container:
return images
+ for img in container.find_all("img"):
+ if img.find_parent("p") and "©" in img.find_parent("p").text:
+ continue
- if version == ParserVersion.ALPHA:
- return image_extractor_alpha
- else:
- raise ValueError("Unknown Parser Version")
-
-
-parse_chuni_jp_news_site = make_chuni_jp_parser(
- "CHUNITHM_JP", ParserVersion.ALPHA
-)
-parse_chuni_jp_post_images = make_image_extractor(ParserVersion.ALPHA)
+ src = img.get("src") or img.get("data-src")
+ if not src:
+ continue
+ full_url = urljoin(base_url, src)
+ parent = img.find_parent("a")
+ link = parent.get("href") if parent and parent.name == "a" else None
+ images.append(
+ {"image": full_url, "link": urljoin(base_url, link) if link else None}
+ )
+ return images
diff --git a/sega/maimaidx_intl.py b/sega/maimaidx_intl.py
index 3e26a37..8182117 100644
--- a/sega/maimaidx_intl.py
+++ b/sega/maimaidx_intl.py
@@ -1,53 +1,7 @@
from bs4 import BeautifulSoup
from datetime import datetime, timezone, timedelta
-from enum import Enum
import json
-class ParserVersion(Enum):
- ALPHA=1
-
-def make_maimaidx_intl_parser(identifier: str, parser: ParserVersion):
- """
- Parses the download page of maimai dx intl site. API route method below is preferred as information is the same
- """
- def alpha_parser(html: str):
- """
- Confirmed on:
- PRISM
- """
- soup = BeautifulSoup(html, "html.parser")
- items = soup.select(".dl--pop__item")
-
- entries = []
- for item in items:
- date_text = item.select_one(".dl--pop__head").text.strip().replace(" UP", "")
- dt = datetime.strptime(date_text, "%Y.%m.%d").replace(tzinfo=timezone(timedelta(hours=9)))
- timestamp = int(dt.timestamp())
-
- img_tag = item.select_one("a.dl--pop__thumb img")
- image_url = img_tag["srcset"] if img_tag else None
- full_image_url = image_url.replace("../", "https://maimai.sega.com/") if image_url else None
-
- entry = {
- "date": date_text,
- "identifier": identifier,
- "type": None,
- "timestamp": timestamp,
- "headline": None,
- "content": f"New maimai DX International News / maimai DX International の新しいお知らせ\n\n{full_image_url}",
- "url": None,
- "images": [
- {
- "image": full_image_url,
- "link": None
- }
- ],
- 'is_ai_summary': False
- }
- entries.append(entry)
- return entries
- if parser == ParserVersion.ALPHA:
- return alpha_parser
def parse_maimaidx_intl_api_route(raw_api_data: str, identifier: str, limit: int):
route_data = json.loads(raw_api_data)
@@ -84,6 +38,3 @@ def parse_maimaidx_intl_api_route(raw_api_data: str, identifier: str, limit: int
}
entries.append(entry)
return entries
-
-
-parse_maimaidx_intl_news_site = make_maimaidx_intl_parser("MAIMAIDX_INTL", ParserVersion.ALPHA)
diff --git a/sega/maimaidx_jp.py b/sega/maimaidx_jp.py
index 1314325..2b61c9a 100644
--- a/sega/maimaidx_jp.py
+++ b/sega/maimaidx_jp.py
@@ -1,60 +1,53 @@
from bs4 import BeautifulSoup
from datetime import datetime, timezone, timedelta
from urllib.parse import urljoin
-from enum import Enum
-class ParserVersion(Enum):
- ALPHA=1
-def make_maimaidx_jpn_parser(identifier: str, parser: ParserVersion):
- def alpha_parser(html: str):
- """
- Confirmed on:
- PRISM PLUS
- """
- soup = BeautifulSoup(html, "html.parser")
- base_url = "https://info-maimai.sega.jp/"
- news_items = []
+def parse_maimaidx_jp_news_site(html: str):
+ """
+ Confirmed on:
+ PRISM PLUS
+ """
+ identifier = "MAIMAIDX_JP"
+ soup = BeautifulSoup(html, "html.parser")
+ base_url = "https://info-maimai.sega.jp/"
+ news_items = []
- news_boxes = soup.select(".maiPager-content .newsBox")
- for box in news_boxes:
- a_tag = box.select_one("a")
- url = urljoin(base_url, a_tag["href"]) if a_tag and a_tag.get("href") else None
+ news_boxes = soup.select(".maiPager-content .newsBox")
+ for box in news_boxes:
+ a_tag = box.select_one("a")
+ url = urljoin(base_url, a_tag["href"]) if a_tag and a_tag.get("href") else None
- img_tag = box.select_one("img")
- image_url = urljoin(base_url, img_tag["src"]) if img_tag else None
+ img_tag = box.select_one("img")
+ image_url = urljoin(base_url, img_tag["src"]) if img_tag else None
- date_tag = box.select_one(".newsDate")
- raw_date = date_tag.get_text(strip=True) if date_tag else None
+ date_tag = box.select_one(".newsDate")
+ raw_date = date_tag.get_text(strip=True) if date_tag else None
- jst = timezone(timedelta(hours=9))
- try:
- dt = datetime.strptime(raw_date.split(" ")[0], "%Y.%m.%d").replace(tzinfo=jst)
- timestamp = int(dt.timestamp())
- except:
- dt = None
- timestamp = 0
+ jst = timezone(timedelta(hours=9))
+ try:
+ dt = datetime.strptime(raw_date.split(" ")[0], "%Y.%m.%d").replace(tzinfo=jst)
+ timestamp = int(dt.timestamp())
+ except Exception:
+ dt = None
+ timestamp = 0
- content_tag = box.select_one(".newsLink")
- content = content_tag.get_text(strip=True) if content_tag else None
+ content_tag = box.select_one(".newsLink")
+ content = content_tag.get_text(strip=True) if content_tag else None
- news_items.append({
- "date": raw_date,
- "identifier": identifier,
- "type": None,
- "timestamp": timestamp,
- "headline": None,
- "content": content,
- "url": url,
- 'is_ai_summary': False,
- "images": [{
- "image": image_url,
- "link": url
- }] if image_url else []
- })
+ news_items.append({
+ "date": raw_date,
+ "identifier": identifier,
+ "type": None,
+ "timestamp": timestamp,
+ "headline": None,
+ "content": content,
+ "url": url,
+ "is_ai_summary": False,
+ "images": [{
+ "image": image_url,
+ "link": url
+ }] if image_url else []
+ })
- return news_items
- if parser == ParserVersion.ALPHA:
- return alpha_parser
-
-parse_maimaidx_jp_news_site = make_maimaidx_jpn_parser("MAIMAIDX_JP", ParserVersion.ALPHA)
+ return news_items \ No newline at end of file
diff --git a/sega/ongeki_jp.py b/sega/ongeki_jp.py
index f9c2dc4..c173189 100644
--- a/sega/ongeki_jp.py
+++ b/sega/ongeki_jp.py
@@ -1,68 +1,58 @@
-import time
-from datetime import datetime
-from enum import Enum
+from datetime import datetime, timezone, timedelta
from bs4 import BeautifulSoup
+JST = timezone(timedelta(hours=9))
-class ParserVersion(Enum):
- ALPHA = 1
+def parse_ongeki_news_site(html: str):
+ identifier = "ONGEKI_JPN"
+ soup = BeautifulSoup(html, "html.parser")
+ items = []
-def make_ongeki_parser(identifier: str, parser: ParserVersion):
- def alpha_parser(html: str):
- soup = BeautifulSoup(html, "html.parser")
- items = []
+ for li in soup.select("li.p-news__listChild"):
+ a_tag = li.select_one("a.p-news__listLink")
+ url = a_tag["href"] if a_tag else None
- for li in soup.select("li.p-news__listChild"):
- a_tag = li.select_one("a.p-news__listLink")
- url = a_tag["href"] if a_tag else None
+ img_tag = li.select_one(".p-news__listThumb img")
+ image_url = img_tag["src"] if img_tag else None
+ image_alt = img_tag["alt"] if img_tag else ""
+ image_link = url if image_url else None
- img_tag = li.select_one(".p-news__listThumb img")
- image_url = img_tag["src"] if img_tag else None
- image_alt = img_tag["alt"] if img_tag else ""
- image_link = url if image_url else None
+ date_type_text = li.select_one(".p-news__listTextUpper")
+ date_text = (
+ date_type_text.text.strip().split("/")[0].strip()
+ if date_type_text
+ else None
+ )
+ type_text = (
+ date_type_text.text.strip().split("/")[-1].strip()
+ if date_type_text and "/" in date_type_text.text
+ else None
+ )
- date_type_text = li.select_one(".p-news__listTextUpper")
- date_text = (
- date_type_text.text.strip().split("/")[0].strip()
- if date_type_text
- else None
- )
- type_text = (
- date_type_text.text.strip().split("/")[-1].strip()
- if "/" in date_type_text.text
- else None
- )
+ timestamp = None
+ if date_text:
+ try:
+ dt = datetime.strptime(date_text, "%Y.%m.%d %a").replace(tzinfo=JST)
+ timestamp = int(dt.timestamp())
+ except Exception:
+ timestamp = None
- timestamp = None
- if date_text:
- try:
- dt = datetime.strptime(date_text, "%Y.%m.%d %a")
- timestamp = int(time.mktime(dt.timetuple()))
- except:
- timestamp = None
+ entry = {
+ "date": date_text,
+ "identifier": identifier,
+ "type": type_text if type_text not in ["GAME", "CARDMAKER"] else None,
+ "timestamp": timestamp,
+ "headline": None,
+ "content": image_alt,
+ "url": url,
+ "is_ai_summary": False,
+ "images": [{"image": image_url, "link": image_link}]
+ if image_url
+ else [],
+ }
- entry = {
- "date": date_text,
- "identifier": identifier,
- "type": type_text if type_text not in ["GAME", "CARDMAKER"] else None,
- "timestamp": timestamp,
- "headline": None,
- "content": image_alt,
- "url": url,
- "is_ai_summary": False,
- "images": [{"image": image_url, "link": image_link}]
- if image_url
- else [],
- }
+ items.append(entry)
- items.append(entry)
-
- return items
-
- if parser == ParserVersion.ALPHA:
- return alpha_parser
-
-
-parse_ongeki_news_site = make_ongeki_parser("ONGEKI_JPN", ParserVersion.ALPHA)
+ return items \ No newline at end of file
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage