1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
|
from bs4 import BeautifulSoup
from datetime import datetime
from urllib.parse import urljoin
import time
BASE_URL = "https://eam.573.jp"
def parse_news_page(html: str, identifier: str):
soup = BeautifulSoup(html, "html.parser")
entries = []
for li in soup.select("ul > li.ef"):
a_tag = li.find("a", href=True)
url = urljoin(BASE_URL, a_tag["href"]) if a_tag else None
date_text = li.select_one(".post-date")
if not date_text:
continue
raw_date = date_text.get_text(strip=True).replace("年", "/").replace("月", "/").replace("日", "")
try:
date_obj = datetime.strptime(raw_date, "%Y/%m/%d")
except ValueError:
continue
date_str = date_obj.strftime("%Y-%m-%d")
timestamp = int(time.mktime(date_obj.timetuple()))
content_tag = li.select_one(".article-text")
content = content_tag.get_text(strip=True) if content_tag else None
img_tag = li.select_one(".article-img img")
image_url = img_tag["src"] if img_tag else None
images = []
if image_url:
images.append({
"image": image_url,
"link": url
})
entry = {
"date": date_str,
"identifier": identifier,
"type": None,
"timestamp": timestamp,
"headline": None,
"content": content,
"url": url,
"images": images,
'is_ai_summary': False
}
entries.append(entry)
return entries
|