blob: 0aa2e0e2421944f6f64d65ed59417d10a580d236 (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
|
from bs4 import BeautifulSoup
from datetime import datetime
import time
import re
def parse_taiko_blog_site(html: str) -> list:
    """Parse Taiko blog HTML into a list of entry dicts.

    Each ``<article>`` element that contains a ``p.entryDate`` tag yields one
    entry. Articles without that tag are skipped. If anything raises while an
    article is being parsed (for example a date that does not match
    ``%Y年%m月%d日``), the error is printed and that article is skipped, so one
    bad article never aborts the whole page.

    Args:
        html: Raw HTML of the blog page.

    Returns:
        A list of dicts, in document order, each with the keys:

        * ``date`` - the stripped date text as it appears on the page.
        * ``identifier`` - always ``"TAIKO"``.
        * ``type`` - always ``None``.
        * ``timestamp`` - integer seconds from ``time.mktime`` for midnight
          of the parsed date.
        * ``headline`` - stripped text of the first ``<h1>``, or ``None``.
        * ``content`` - subheader titles, one ``"• title"`` per line.
        * ``url`` - always ``None``.
        * ``images`` - list of ``{"image": url, "link": None}`` dicts.
    """
    base_url: str = "https://taiko-ch.net"
    # Scheme of base_url ("https:"), used to complete protocol-relative URLs.
    base_scheme = base_url.split("//", 1)[0]
    # Subheaders are <div>s whose inline style carries the orange background.
    # Built once here rather than once per article.
    subheader_style = re.compile(r"background:\s?#ff4500")

    soup = BeautifulSoup(html, "html.parser")
    entries = []

    for article in soup.select("article"):
        try:
            # Get date and timestamp
            date_tag = article.select_one("p.entryDate")
            if date_tag is None:
                continue
            date_str = date_tag.text.strip()
            date_obj = datetime.strptime(date_str, "%Y年%m月%d日")
            # NOTE: time.mktime interprets the struct_time as local time, so
            # the timestamp depends on the time zone of the running process.
            timestamp = int(time.mktime(date_obj.timetuple()))

            # Get headline
            headline_tag = article.select_one("h1")
            headline = None if headline_tag is None else headline_tag.text.strip()

            # Get subheaders
            content = []
            for div in article.find_all("div", style=subheader_style):
                title_text = div.get_text(strip=True).replace("■", "").strip()
                if title_text:
                    content.append(f"• {title_text}")

            # Get images
            images = []
            for img in article.find_all("img"):
                img_url = img.get("src") or img.get("data-src")
                if not img_url:
                    continue
                if img_url.startswith("//"):
                    # Protocol-relative URL: it already names its host, so it
                    # only needs a scheme. Prefixing base_url here would
                    # produce "https://taiko-ch.net//host/...".
                    img_url = base_scheme + img_url
                elif img_url.startswith("/"):
                    # Site-root-relative path.
                    img_url = base_url + img_url
                images.append({"image": img_url, "link": None})

            entries.append(
                {
                    "date": date_str,
                    "identifier": "TAIKO",
                    "type": None,
                    "timestamp": timestamp,
                    "headline": headline,
                    "content": "\n".join(content),
                    "url": None,
                    "images": images,
                }
            )
        except Exception as e:
            # Deliberate best-effort: report the bad article and keep going.
            print(f"Error parsing article: {e}")
            continue

    return entries
|