diff options
| author | Pinapelz <yukais@pinapelz.com> | 2025-04-17 15:31:37 -0700 |
|---|---|---|
| committer | Pinapelz <yukais@pinapelz.com> | 2025-04-17 15:31:37 -0700 |
| commit | a87715649b4fdfbd549aad493fb262f91f563325 (patch) | |
| tree | ceb6179c9080f219451bb1008ec729d4e3b713e8 /taito | |
| parent | ead6f998b47ff9e9f69ab636a995cbb30acdb775 (diff) | |
add MUSIC DIVER support
Diffstat (limited to 'taito')
| -rw-r--r-- | taito/music_diver.py | 57 |
1 files changed, 57 insertions, 0 deletions
# taito/music_diver.py
"""Parse MUSIC DIVER news JSON payloads into normalized news post dicts."""

import json
import re
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List, Optional, Tuple

from bs4 import BeautifulSoup

# Japan Standard Time is a fixed UTC+9 offset (no daylight saving time).
_JST = timezone(timedelta(hours=9))

# Three or more consecutive newlines are collapsed to one blank line.
_EXCESS_NEWLINES = re.compile(r"\n{3,}")


def _parse_html_content(html: str) -> Tuple[str, List[Dict[str, Optional[str]]]]:
    """Convert a post's HTML body to plain text and collect its images.

    Images are removed from the text and reported separately, ``<br>`` tags
    become blank lines, links with an ``href`` become Markdown
    ``[text](href)``, and every remaining tag is stripped while keeping its
    text.

    Args:
        html: HTML fragment to convert.

    Returns:
        A ``(text, images)`` tuple. ``images`` is a list of dicts with the
        keys ``"image"`` (the ``src`` attribute) and ``"link"`` (the ``href``
        of the enclosing ``<a>``, or ``None`` when there is none).
    """
    soup = BeautifulSoup(html, "html.parser")

    images = []
    for img in soup.find_all("img"):
        src = img.get("src")
        # An <img> without a src carries nothing worth reporting; it is still
        # removed from the tree below so it cannot affect the text.
        if src:
            anchor = img.find_parent("a")
            images.append({
                "image": src,
                "link": anchor.get("href") if anchor is not None else None,
            })
        img.decompose()

    for br in soup.find_all("br"):
        br.replace_with("\n\n")

    for a in soup.find_all("a"):
        text = a.get_text()
        href = a.get("href")
        if not href:
            # The separator must be inserted BEFORE unwrapping: unwrap()
            # detaches the tag, and insert_after() on a parentless tag
            # raises ValueError.
            a.insert_after(" ")
            a.unwrap()
        elif text.strip():
            a.replace_with(f" [{text}]({href}) ")
        else:
            # A link with no text left (typically one that only wrapped an
            # image removed above) would render as "[](href)"; its target is
            # already recorded in the matching images entry.
            a.extract()

    # Strip all remaining markup, keeping a space where each tag ended so
    # adjacent text nodes do not run together.
    for tag in soup.find_all(True):
        tag.insert_after(" ")
        tag.unwrap()

    text = _EXCESS_NEWLINES.sub("\n\n", soup.get_text()).strip()
    return text, images


def parse_music_diver_news_json(data_str: str) -> List[Dict[str, Any]]:
    """Parse a MUSIC DIVER news JSON payload into a list of news posts.

    Args:
        data_str: JSON document with a ``responseCode`` field and a
            ``response`` list of posts. Each post is read for ``content``
            (HTML), ``show_start`` (ISO 8601 timestamp) and ``title``.

    Returns:
        One dict per post with the keys ``date`` (``YYYY-MM-DD``),
        ``identifier``, ``type``, ``timestamp`` (Unix seconds), ``headline``,
        ``content``, ``url`` and ``images``. An empty list is returned when
        ``responseCode`` is not 200 or the payload holds no posts.

    Raises:
        json.JSONDecodeError: If ``data_str`` is not valid JSON.
        KeyError: If a post lacks ``show_start`` or ``title``.
        ValueError: If ``show_start`` is not a parseable ISO 8601 timestamp.
    """
    data = json.loads(data_str)
    if data.get("responseCode") != 200:
        return []

    news_posts = []
    for post in data.get("response") or []:
        content, images = _parse_html_content(post.get("content") or "")

        # datetime.fromisoformat() only accepts a trailing "Z" from
        # Python 3.11 on, so it is rewritten as an explicit UTC offset.
        show_date = datetime.fromisoformat(
            post["show_start"].replace("Z", "+00:00")
        )
        # Report the calendar date as seen in Japan; formatting the UTC value
        # directly yields the previous day for posts published before
        # 09:00 JST.
        # NOTE(review): a timestamp without an offset is formatted as-is,
        # since its zone cannot be determined here - confirm the API always
        # sends an offset.
        if show_date.tzinfo is not None:
            local_date = show_date.astimezone(_JST)
        else:
            local_date = show_date

        news_posts.append({
            "date": local_date.strftime("%Y-%m-%d"),
            "identifier": "MUSIC_DIVER",
            "type": None,
            "timestamp": int(show_date.timestamp()),
            "headline": post["title"],
            "content": content,
            "url": None,
            "images": images,
        })
    return news_posts
