aboutsummaryrefslogtreecommitdiffstats
path: root/bandai_namco/taiko.py
blob: a417a45aca9c95b5c88c86e5606a905389de86e6 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import re
import time
from datetime import datetime
from urllib.parse import urljoin

from bs4 import BeautifulSoup

def parse_taiko_blog_site(html: str) -> list:
    """Parse the taiko-ch.net blog page HTML into a list of news entries.

    Every ``<article>`` element that carries a ``p.entryDate`` tag yields one
    entry. Articles without a date tag are skipped silently; articles that
    raise while being parsed (for example an unexpected date format) are
    reported on stdout and skipped, so one bad article never aborts the
    whole page.

    Args:
        html: Raw HTML of the blog page.

    Returns:
        A list of dicts, one per parsed article, with the keys ``date``
        (the date text as shown on the page), ``identifier`` (always
        ``"TAIKO"``), ``type`` (always ``None``), ``timestamp`` (integer
        epoch seconds for midnight of the article date), ``headline``
        (text of the first ``<h1>``, or ``None``), ``content``
        (bullet-prefixed subheader titles joined by newlines), ``url``
        (the daily archive link), ``images`` (list of
        ``{"image": <absolute url>, "link": None}``) and ``is_ai_summary``
        (always ``False``).
    """
    base_url: str = "https://taiko-ch.net"
    soup = BeautifulSoup(html, "html.parser")
    # Compiled once up front instead of once per article.
    subheader_style = re.compile(r"background:\s?#ff4500")

    entries = []

    for article in soup.select("article"):
        # Per-article best-effort boundary: a malformed article is logged and
        # skipped rather than failing the whole page.
        try:
            # Get date and timestamp
            date_tag = article.select_one("p.entryDate")
            if not date_tag:
                continue
            date_str = date_tag.text.strip()
            date_obj = datetime.strptime(date_str, "%Y年%m月%d日")
            # NOTE: time.mktime interprets the struct_time as *local* time, so
            # the resulting timestamp depends on the timezone of the host
            # running this parser.
            timestamp = int(time.mktime(date_obj.timetuple()))
            # The blog exposes a per-day archive page at /?m=YYYYMMDD.
            url = f"{base_url}/?m={date_obj.strftime('%Y%m%d')}"

            # Get headline
            headline_tag = article.select_one("h1")
            headline = headline_tag.text.strip() if headline_tag else None

            # Get subheaders: the orange-background divs act as section
            # titles; the leading "■" marker is dropped.
            content = []
            for div in article.find_all("div", style=subheader_style):
                title_text = div.get_text(strip=True).replace("■", "").strip()
                if title_text:
                    content.append(f"• {title_text}")

            # Get images. "data-src" is used as a fallback when "src" is
            # missing or empty.
            images = []
            for img in article.find_all("img"):
                img_url = img.get("src") or img.get("data-src")
                if not img_url:
                    continue
                # urljoin resolves root-relative ("/a.png"), page-relative
                # ("a.png") and protocol-relative ("//host/a.png") references
                # against the site root, and leaves absolute URLs untouched.
                # Plain concatenation would turn "//host/a.png" into a broken
                # "https://taiko-ch.net//host/a.png".
                images.append(
                    {"image": urljoin(base_url + "/", img_url), "link": None}
                )

            entries.append(
                {
                    "date": date_str,
                    "identifier": "TAIKO",
                    "type": None,
                    "timestamp": timestamp,
                    "headline": headline,
                    "content": "\n".join(content),
                    "url": url,
                    "images": images,
                    "is_ai_summary": False,
                }
            )
        except Exception as e:
            print(f"Error parsing article: {e}")
            continue

    return entries
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage