# bemani/iidx.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61

from bs4 import BeautifulSoup
from datetime import datetime
from urllib.parse import urljoin
import re


def parse_pinky_crush_news_site(html: str, base_url: str) -> list:
    """Extract news entries from the ``#info-news`` list of a news page.

    Each direct ``<li>`` child of ``#info-news`` is expected to contain a
    ``.news-main`` list whose first three ``<li>`` children are, in order,
    the date, the headline and the body. Items lacking any of those three
    elements are skipped.

    Args:
        html: Raw HTML of the news page.
        base_url: URL used to resolve relative ``href`` values in the body.

    Returns:
        A list of dicts, in document order, with the keys ``date`` (the raw
        date text), ``type`` (category name), ``timestamp`` (integer epoch
        seconds, or ``None`` if the date is not ``YYYY/MM/DD``), ``headline``,
        ``content`` (body text with links rendered as ``[text](url)``),
        ``url`` (always ``None``) and ``images`` (always an empty list).
    """
    type_map = {
        "i_01": "NEWSONG",
        "i_02": "RANKING",
        "i_03": "EVENT",
        "i_04": "SHOP",
        "i_05": "OTHER",
    }
    soup = BeautifulSoup(html, "html.parser")
    news_items = []

    for li in soup.select("#info-news > li"):
        date_elem = li.select_one(".news-main > li:nth-of-type(1)")
        headline_elem = li.select_one(".news-main > li:nth-of-type(2)")
        content_elem = li.select_one(".news-main > li:nth-of-type(3)")
        if date_elem is None or headline_elem is None or content_elem is None:
            continue

        # Look through every class on the item rather than only the first,
        # and fall back to "OTHER" so that a missing, empty or unrecognised
        # class no longer aborts the whole parse with KeyError/IndexError.
        item_classes = li.get("class") or []
        news_type = next(
            (type_map[cls] for cls in item_classes if cls in type_map),
            "OTHER",
        )

        date_str = date_elem.text.strip()
        try:
            dt = datetime.strptime(date_str, "%Y/%m/%d")
        except ValueError:
            timestamp = None
        else:
            # NOTE(review): dt is naive, so timestamp() interprets it in the
            # local timezone of the machine running this code.
            timestamp = int(dt.timestamp())

        # Prefer the text of a link inside the headline cell when present.
        headline_link = headline_elem.a
        headline_source = headline_link if headline_link is not None else headline_elem
        headline = headline_source.text.strip()

        # Rewrite links as Markdown with absolute URLs before flattening the
        # body to text, otherwise the href would be lost.
        for a in content_elem.select("a[href]"):
            href = urljoin(base_url, a["href"])
            text = a.get_text(strip=True)
            a.replace_with(f"[{text}]({href})")

        # Preserve explicit line breaks in the flattened text.
        for br in content_elem.find_all("br"):
            br.replace_with("\n")

        content = content_elem.get_text().strip()

        # Collapse the heavily padded course name emitted by the page markup.
        content = content.replace(
            "                              e-amusement ベーシックコース                          ",
            " e-amusement ベーシックコース "
        )
        # Start every "※" note on its own line.
        content = content.replace("※", "\n※")
        # Drop indentation left at the start of lines.
        content = re.sub(r"\n[ \t]+", "\n", content)
        # Remove whitespace (including newlines) around slashes.
        content = re.sub(r'\s*/\s*', '/', content)

        news_items.append({
            "date": date_str,
            "type": news_type,
            "timestamp": timestamp,
            "headline": headline,
            "content": content,
            "url": None,
            "images": [],
        })

    return news_items