diff options
Diffstat (limited to 'bemani')
| -rw-r--r-- | bemani/__init__.py | 7 | ||||
| -rw-r--r-- | bemani/ddr.py | 63 | ||||
| -rw-r--r-- | bemani/iidx.py | 68 |
3 files changed, 7 insertions, 131 deletions
diff --git a/bemani/__init__.py b/bemani/__init__.py new file mode 100644 index 0000000..f16ed0a --- /dev/null +++ b/bemani/__init__.py @@ -0,0 +1,7 @@ +from bemani.sdvx import parse_exceed_gear_news_site +from bemani.polaris_chord import parse_polaris_chord_news_site + +__all__ = [ + "parse_exceed_gear_news_site", + "parse_polaris_chord_news_site", +] diff --git a/bemani/ddr.py b/bemani/ddr.py deleted file mode 100644 index b5ae93c..0000000 --- a/bemani/ddr.py +++ /dev/null @@ -1,63 +0,0 @@ -""" -Currently unused as e-eamusement app feed is favored. Here for archival purposes -""" -from bs4 import BeautifulSoup -from datetime import datetime -from urllib.parse import urljoin -import time - -def parse_ddr_world_news_site(html: str): - base_url = "https://p.eagate.573.jp" - soup = BeautifulSoup(html, 'html.parser') - news_entries = [] - - for div in soup.select("div#info > div.news_one"): - style = div.get('style', '') - if 'none' in style: - continue - - title_tag = div.select_one("div.news_title > div.title") - date_tag = div.select_one("div.news_title > div.date") - headline = title_tag.get_text(strip=True) if title_tag else None - date_str = date_tag.get_text(strip=True) if date_tag else None - - try: - dt = datetime.strptime(date_str, "%Y/%m/%d") - date_iso = dt.strftime("%Y-%m-%d") - timestamp = int(time.mktime(dt.timetuple())) - except Exception: - date_iso, timestamp = None, None - paras = [p.get_text(strip=True, separator="\n\n") - for p in div.find_all("p", recursive=False)] - if not paras: - for child in div.find_all(recursive=False): - cls = child.get("class", []) - if "news_title" in cls or "img_news_center" in cls: - continue - if child.name == "div": - paras.append(child.get_text(strip=True, separator="\n\n")) - - content = "\n\n\n".join(paras) if paras else None - if content: - content = f"\n{content}\n" - - images = [] - for img in div.select("div.img_news_center img"): - raw_src = img.get("data-src") or img.get("src") - if raw_src: - full_url = urljoin(base_url, raw_src) - images.append({"image": full_url, "link": None}) - - news_entries.append({ - "date": date_iso, - "identifier": "DDR", - "type": None, - "timestamp": timestamp, - "headline": headline, - "content": content, - "url": None, - "images": images, - 'is_ai_summary': False - }) - - return news_entries diff --git a/bemani/iidx.py b/bemani/iidx.py deleted file mode 100644 index de7f34c..0000000 --- a/bemani/iidx.py +++ /dev/null @@ -1,68 +0,0 @@ -from bs4 import BeautifulSoup -from datetime import datetime -from urllib.parse import urljoin -import re - -KEY_TERMS_TL = [ - ("クプロ", "QPro") -] - -# Legacy code. e-amuse feed provides better data -def parse_pinky_crush_news_site(html: str): - base_url = "https://p.eagate.573.jp" - type_map = { - "i_01": "NEWSONG", - "i_02": "RANKING", - "i_03": "EVENT", - "i_04": "SHOP", - "i_05": "OTHER" - } - soup = BeautifulSoup(html, "html.parser") - news_items = [] - - for li in soup.select("#info-news > li"): - date_elem = li.select_one(".news-main > li:nth-of-type(1)") - headline_elem = li.select_one(".news-main > li:nth-of-type(2)") - content_elem = li.select_one(".news-main > li:nth-of-type(3)") - type_class = li.get("class", [None])[0] - if not (date_elem and content_elem): - continue - date_str = date_elem.text.strip() - try: - dt = datetime.strptime(date_str, "%Y/%m/%d") - timestamp = int(dt.timestamp()) - except ValueError: - timestamp = None - - headline = headline_elem.a.text.strip() if headline_elem.a else headline_elem.text.strip() - - for a in content_elem.select("a[href]"): - href = urljoin(base_url, a["href"]) - text = a.get_text(strip=True) - a.replace_with(f"[{text}]({href})") - - for br in content_elem.find_all("br"): - br.replace_with("\n") - - content = content_elem.get_text().strip() - - content = content.replace( - " e-amusement ベーシックコース ", - " e-amusement ベーシックコース " - ) - content = content.replace("※", "\n※") - content = re.sub(r"\n[ \t]+", "\n", content) - content = re.sub(r'\s*/\s*', '/', content) - news_items.append({ - "date": date_str, - "identifier": "IIDX", - "type": type_map[type_class], - "timestamp": timestamp, - "headline": headline, - "content": content, - "url": None, - "images": [], - 'is_ai_summary': False - }) - - return news_items |
