# taito/music_diver.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58

import json
import re
from datetime import datetime, timedelta, timezone

from bs4 import BeautifulSoup

def _parse_html_content(html: str):
    """Flatten an HTML fragment into text and collect the images it contains.

    Processing steps, in order:

    1. Every ``<img>`` that has a ``src`` is recorded together with the
       ``href`` of its nearest enclosing ``<a>`` (if any), then all ``<img>``
       tags are removed from the tree.
    2. Every ``<br>`` becomes a blank line (two newlines).
    3. Every ``<a>`` with a non-empty ``href`` becomes a Markdown link
       ``[text](href)`` padded with spaces; anchors without ``href`` are
       replaced by their contents followed by a space.
    4. All remaining tags are replaced by their contents followed by a space.
    5. Runs of three or more newlines are collapsed to two and the result is
       stripped.

    Args:
        html: The HTML fragment to convert.

    Returns:
        A ``(text, images)`` tuple. ``text`` is the flattened string and
        ``images`` is a list of ``{"image": src, "link": href_or_None}``
        dicts in document order.
    """
    soup = BeautifulSoup(html, "html.parser")

    images = []
    for img in soup.find_all("img"):
        src = img.get("src")
        if src:
            parent = img.find_parent("a")
            images.append({
                "image": src,
                # .get(): an enclosing <a> is not guaranteed to carry href.
                "link": parent.get("href") if parent else None,
            })
        # Images never contribute to the text, with or without a src.
        img.decompose()

    for br in soup.find_all("br"):
        br.replace_with("\n\n")

    for a in soup.find_all("a"):
        href = a.get("href")
        if href:
            a.replace_with(f" [{a.get_text()}]({href}) ")
        else:
            # The separator must be inserted while the tag is still attached:
            # unwrap() detaches it, and insert_after() on a parentless
            # element raises ValueError.
            a.insert_after(" ")
            a.unwrap()

    # find_all() returns a materialised list, so mutating the tree while
    # walking it is safe here.
    for tag in soup.find_all(True):
        tag.insert_after(" ")
        tag.unwrap()

    text = soup.get_text()
    text = re.sub(r"\n{3,}", "\n\n", text).strip()
    return text, images

def parse_music_diver_news_json(data_str: str):
    """Parse a MUSIC DIVER news JSON payload into a list of news-post dicts.

    The payload is expected to be a JSON object with a ``responseCode`` and a
    ``response`` list whose items carry ``content`` (HTML), ``show_start``
    (ISO 8601 timestamp, optionally ending in ``Z``) and ``title``.

    Args:
        data_str: The raw JSON text.

    Returns:
        One dict per post with the keys ``date`` (``YYYY-MM-DD`` in JST),
        ``identifier``, ``type``, ``timestamp`` (Unix seconds), ``headline``,
        ``content``, ``url``, ``images`` and ``is_ai_summary``. An empty list
        is returned when ``responseCode`` is absent or not 200, or when
        ``response`` is absent, null or empty.

    Raises:
        json.JSONDecodeError: If ``data_str`` is not valid JSON.
        KeyError: If a post lacks ``content``, ``show_start`` or ``title``.
    """
    data = json.loads(data_str)
    # A payload without a status code is treated like any other non-success.
    if data.get("responseCode") != 200:
        return []

    jst = timezone(timedelta(hours=9))

    news_posts = []
    for post in data.get("response") or []:
        content, images = _parse_html_content(post["content"])
        # datetime.fromisoformat() only accepts a trailing "Z" from Python
        # 3.11 on, so it is rewritten as an explicit UTC offset.
        show_date = datetime.fromisoformat(post["show_start"].replace("Z", "+00:00"))
        # The calendar date must be taken in JST, not in the offset the API
        # sent (UTC), otherwise posts published before 09:00 JST get the
        # previous day's date. A naive value is interpreted as system local
        # time, which is also what .timestamp() does below.
        jst_date = show_date.astimezone(jst).strftime("%Y-%m-%d")
        timestamp = int(show_date.timestamp())

        news_posts.append({
            "date": jst_date,
            "identifier": "MUSIC_DIVER",
            "type": None,
            "timestamp": timestamp,
            "headline": post["title"],
            "content": content,
            "url": None,
            "images": images,
            "is_ai_summary": False,
        })
    return news_posts