diff options
| -rw-r--r-- | sega/chuni_jp.py | 111 | ||||
| -rw-r--r-- | sega/maimaidx_intl.py | 69 | ||||
| -rw-r--r-- | sega/maimaidx_jp.py | 85 | ||||
| -rw-r--r-- | sega/ongeki_jp.py | 84 |
4 files changed, 199 insertions, 150 deletions
diff --git a/sega/chuni_jp.py b/sega/chuni_jp.py index bdbe800..981fb8f 100644 --- a/sega/chuni_jp.py +++ b/sega/chuni_jp.py @@ -2,54 +2,69 @@ from bs4 import BeautifulSoup from datetime import datetime, timezone, timedelta from urllib.parse import urljoin import re +from enum import Enum -def parse_chuni_jp_verse_news_site(html: str): - soup = BeautifulSoup(html, "html.parser") - news_entries = [] - news_wrapper = soup.find("div", class_="newsMainWrapper-left") - if not news_wrapper: - return news_entries - for a_tag in news_wrapper.find_all("a", href=True): - if not a_tag.find("div", class_="chuniCommonBox-inner"): - continue - news_dict = {} - news_url = a_tag.get("href") - news_dict["url"] = news_url - date_container = a_tag.find("div", class_="chuniCommonBox-inner-title") - date_str = None - if date_container: - title_span = date_container.find("span", class_="title") - if title_span: - text = title_span.get_text(strip=True) - date_match = re.search(r"(\d{4}\.\d{2}\.\d{2})", text) - if date_match: - date_str = date_match.group(1) - news_dict["date"] = date_str - news_dict["type"] = None - timestamp = None - if date_str: - try: - dt = datetime.strptime(date_str, "%Y.%m.%d") - dt = dt.replace(tzinfo=timezone(timedelta(hours=9))) - timestamp = int(dt.timestamp()) - except Exception: - timestamp = None - news_dict["timestamp"] = timestamp - main_content = a_tag.find("div", class_="chuniCommonBox-inner-main") - headline = None - content_text = "" - if main_content: - content_text = main_content.get_text(separator=" ", strip=True) - news_dict["content"] = content_text - images = {"image": None, "link": None} - if main_content: - img_tag = main_content.find("img") - if img_tag: - images["image"] = img_tag.get("src") - images["link"] = news_url - news_dict["images"] = [images] - news_dict["identifier"] = "CHUNITHM_JP_VERSE" +class ParserVersion(Enum): + ALPHA=1 + +def make_chuni_jp_parser(identifier: str, parser: ParserVersion): + def alpha_parser(html: str): + """ + Confirmed on: + VERSE + """ + soup = BeautifulSoup(html, "html.parser") + news_entries = [] + news_wrapper = soup.find("div", class_="newsMainWrapper-left") + if not news_wrapper: + return news_entries + for a_tag in news_wrapper.find_all("a", href=True): + if not a_tag.find("div", class_="chuniCommonBox-inner"): + continue + news_dict = {} + news_url = a_tag.get("href") + news_dict["url"] = news_url + + date_container = a_tag.find("div", class_="chuniCommonBox-inner-title") + date_str = None + if date_container: + title_span = date_container.find("span", class_="title") + if title_span: + text = title_span.get_text(strip=True) + date_match = re.search(r"(\d{4}\.\d{2}\.\d{2})", text) + if date_match: + date_str = date_match.group(1) + news_dict["date"] = date_str + news_dict["type"] = None + timestamp = None + if date_str: + try: + dt = datetime.strptime(date_str, "%Y.%m.%d") + dt = dt.replace(tzinfo=timezone(timedelta(hours=9))) + timestamp = int(dt.timestamp()) + except Exception: + timestamp = None + news_dict["timestamp"] = timestamp + + main_content = a_tag.find("div", class_="chuniCommonBox-inner-main") + content_text = "" + if main_content: + content_text = main_content.get_text(separator=" ", strip=True) + news_dict["content"] = content_text - news_entries.append(news_dict) + images = {"image": None, "link": None} + if main_content: + img_tag = main_content.find("img") + if img_tag: + images["image"] = img_tag.get("src") + images["link"] = news_url + news_dict["images"] = [images] + news_dict["identifier"] = identifier + + news_entries.append(news_dict) + + return news_entries + if parser == ParserVersion.ALPHA: + return alpha_parser - return news_entries +parse_chuni_jp_verse_news_site = make_chuni_jp_parser("CHUNITHM_JP_VERSE", ParserVersion.ALPHA) diff --git a/sega/maimaidx_intl.py b/sega/maimaidx_intl.py index 57e7cfb..1671d9f 100644 --- a/sega/maimaidx_intl.py +++ b/sega/maimaidx_intl.py @@ -1,36 +1,47 @@ from bs4 import BeautifulSoup from datetime import datetime, timezone, timedelta -import time +from enum import Enum -def parse_maimaidx_intl_prism_news_site(html: str): - soup = BeautifulSoup(html, "html.parser") - items = soup.select(".dl--pop__item") +class ParserVersion(Enum): + ALPHA=1 - entries = [] - for item in items: - date_text = item.select_one(".dl--pop__head").text.strip().replace(" UP", "") - dt = datetime.strptime(date_text, "%Y.%m.%d").replace(tzinfo=timezone(timedelta(hours=9))) - timestamp = int(dt.timestamp()) +def make_maimaidx_intl_parser(identifier: str, parser: ParserVersion): + def alpha_parser(html: str): + """ + Confirmed on: + PRISM + """ + soup = BeautifulSoup(html, "html.parser") + items = soup.select(".dl--pop__item") - img_tag = item.select_one("a.dl--pop__thumb img") - image_url = img_tag["srcset"] if img_tag else None - full_image_url = image_url.replace("../", "https://maimai.sega.com/") if image_url else None + entries = [] + for item in items: + date_text = item.select_one(".dl--pop__head").text.strip().replace(" UP", "") + dt = datetime.strptime(date_text, "%Y.%m.%d").replace(tzinfo=timezone(timedelta(hours=9))) + timestamp = int(dt.timestamp()) - entry = { - "date": date_text, - "identifier": "MAIMAIDX_INTL_PRISM", - "type": None, - "timestamp": timestamp, - "headline": None, - "content": f"New maimai DX International News / maimai DX International の新しいお知らせ\n\n{full_image_url}", - "url": None, - "images": [ - { - "image": full_image_url, - "link": None - } - ] - } + img_tag = item.select_one("a.dl--pop__thumb img") + image_url = img_tag["srcset"] if img_tag else None + full_image_url = image_url.replace("../", "https://maimai.sega.com/") if image_url else None - entries.append(entry) - return entries + entry = { + "date": date_text, + "identifier": identifier, + "type": None, + "timestamp": timestamp, + "headline": None, + "content": f"New maimai DX International News / maimai DX International の新しいお知らせ\n\n{full_image_url}", + "url": None, + "images": [ + { + "image": full_image_url, + "link": None + } + ] + } + entries.append(entry) + return entries + if parser == ParserVersion.ALPHA: + return alpha_parser + +parse_maimaidx_intl_prism_news_site = make_maimaidx_intl_parser("MAIMAIDX_INTL_PRISM", ParserVersion.ALPHA) diff --git a/sega/maimaidx_jp.py b/sega/maimaidx_jp.py index 90530f0..720a618 100644 --- a/sega/maimaidx_jp.py +++ b/sega/maimaidx_jp.py @@ -1,46 +1,59 @@ from bs4 import BeautifulSoup from datetime import datetime, timezone, timedelta from urllib.parse import urljoin -import re +from enum import Enum -def parse_maimaidx_jp_prism_plus_news_site(html: str): - soup = BeautifulSoup(html, "html.parser") - base_url = "https://info-maimai.sega.jp/" - news_items = [] +class ParserVersion(Enum): + ALPHA=1 - news_boxes = soup.select(".maiPager-content .newsBox") - for box in news_boxes: - a_tag = box.select_one("a") - url = urljoin(base_url, a_tag["href"]) if a_tag and a_tag.get("href") else None +def make_maimaidx_jpn_parser(identifier: str, parser: ParserVersion): + def alpha_parser(html: str): + """ + Confirmed on: + PRISM PLUS + """ + soup = BeautifulSoup(html, "html.parser") + base_url = "https://info-maimai.sega.jp/" + news_items = [] - img_tag = box.select_one("img") - image_url = urljoin(base_url, img_tag["src"]) if img_tag else None + news_boxes = soup.select(".maiPager-content .newsBox") + for box in news_boxes: + a_tag = box.select_one("a") + url = urljoin(base_url, a_tag["href"]) if a_tag and a_tag.get("href") else None - date_tag = box.select_one(".newsDate") - raw_date = date_tag.get_text(strip=True) if date_tag else None + img_tag = box.select_one("img") + image_url = urljoin(base_url, img_tag["src"]) if img_tag else None - jst = timezone(timedelta(hours=9)) - try: - dt = datetime.strptime(raw_date.split(" ")[0], "%Y.%m.%d").replace(tzinfo=jst) - timestamp = int(dt.timestamp()) - except: - dt = None - timestamp = 0 + date_tag = box.select_one(".newsDate") + raw_date = date_tag.get_text(strip=True) if date_tag else None - content_tag = box.select_one(".newsLink") - content = content_tag.get_text(strip=True) if content_tag else None - news_items.append({ - "date": raw_date, - "identifier": "MAIMAIDX_JPN_PRISM_PLUS", - "type": None, - "timestamp": timestamp, - "headline": None, - "content": content, - "url": url, - "images": [{ - "image": image_url, - "link": url - }] if image_url else [] - }) + jst = timezone(timedelta(hours=9)) + try: + dt = datetime.strptime(raw_date.split(" ")[0], "%Y.%m.%d").replace(tzinfo=jst) + timestamp = int(dt.timestamp()) + except: + dt = None + timestamp = 0 - return news_items + content_tag = box.select_one(".newsLink") + content = content_tag.get_text(strip=True) if content_tag else None + + news_items.append({ + "date": raw_date, + "identifier": identifier, + "type": None, + "timestamp": timestamp, + "headline": None, + "content": content, + "url": url, + "images": [{ + "image": image_url, + "link": url + }] if image_url else [] + }) + + return news_items + if parser == ParserVersion.ALPHA: + return alpha_parser + +parse_maimaidx_jp_prism_plus_news_site = make_maimaidx_jpn_parser("MAIMAIDX_JPN_PRISM_PLUS", ParserVersion.ALPHA) diff --git a/sega/ongeki_jp.py b/sega/ongeki_jp.py index 587f358..a2a05fb 100644 --- a/sega/ongeki_jp.py +++ b/sega/ongeki_jp.py @@ -1,48 +1,58 @@ from bs4 import BeautifulSoup from datetime import datetime import time +from enum import Enum -def parse_ongeki_refresh_news_site(html: str): - soup = BeautifulSoup(html, "html.parser") - items = [] +class ParserVersion(Enum): + ALPHA=1 - for li in soup.select("li.p-news__listChild"): - a_tag = li.select_one("a.p-news__listLink") - url = a_tag["href"] if a_tag else None +def make_ongeki_parser(identifier: str, parser: ParserVersion): + def alpha_parser(html: str): + soup = BeautifulSoup(html, "html.parser") + items = [] - img_tag = li.select_one(".p-news__listThumb img") - image_url = img_tag["src"] if img_tag else None - image_alt = img_tag["alt"] if img_tag else "" - image_link = url if image_url else None + for li in soup.select("li.p-news__listChild"): + a_tag = li.select_one("a.p-news__listLink") + url = a_tag["href"] if a_tag else None - date_type_text = li.select_one(".p-news__listTextUpper") - date_text = date_type_text.text.strip().split("/")[0].strip() if date_type_text else None - type_text = date_type_text.text.strip().split("/")[-1].strip() if "/" in date_type_text.text else None + img_tag = li.select_one(".p-news__listThumb img") + image_url = img_tag["src"] if img_tag else None + image_alt = img_tag["alt"] if img_tag else "" + image_link = url if image_url else None - headline_tag = li.select_one(".p-news__listTextUnder") - headline = headline_tag.text.strip() if headline_tag else None + date_type_text = li.select_one(".p-news__listTextUpper") + date_text = date_type_text.text.strip().split("/")[0].strip() if date_type_text else None + type_text = date_type_text.text.strip().split("/")[-1].strip() if "/" in date_type_text.text else None - timestamp = None - if date_text: - try: - dt = datetime.strptime(date_text, "%Y.%m.%d %a") - timestamp = int(time.mktime(dt.timetuple())) - except: - timestamp = None - entry = { - "date": date_text, - "identifier": "ONGEKI_JPN_REFRESH", - "type": type_text if type_text not in ["GAME", "CARDMAKER"] else None, - "timestamp": timestamp, - "headline": None, - "content": image_alt, - "url": url, - "images": [{ - "image": image_url, - "link": image_link - }] if image_url else [] - } + headline_tag = li.select_one(".p-news__listTextUnder") + headline = headline_tag.text.strip() if headline_tag else None - items.append(entry) + timestamp = None + if date_text: + try: + dt = datetime.strptime(date_text, "%Y.%m.%d %a") + timestamp = int(time.mktime(dt.timetuple())) + except: + timestamp = None - return items + entry = { + "date": date_text, + "identifier": identifier, + "type": type_text if type_text not in ["GAME", "CARDMAKER"] else None, + "timestamp": timestamp, + "headline": None, + "content": image_alt, + "url": url, + "images": [{ + "image": image_url, + "link": image_link + }] if image_url else [] + } + + items.append(entry) + + return items + if parser == ParserVersion.ALPHA: + return alpha_parser + +parse_ongeki_refresh_news_site = make_ongeki_parser("ONGEKI_JPN_REFRESH", ParserVersion.ALPHA) |
