1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
|
from bs4 import BeautifulSoup
from datetime import datetime
from urllib.parse import urljoin
import json
import time
# Site root against which relative article links scraped from news pages are
# resolved (see the urljoin call in parse_news_page).
BASE_URL = "https://eam.573.jp"
def parse_news_page(html: str, identifier: str):
    """
    Scrape news entries out of a news page's HTML (legacy path).

    Prefer the API-based parser when it works; it is expected to be much
    faster than scraping.

    Every ``ul > li.ef`` element that carries a parseable ``.post-date`` is
    turned into one entry dict. Items without a date node, or whose date
    does not match ``YYYY/MM/DD`` after normalisation, are skipped.

    Args:
        html: Markup of the news page.
        identifier: Value copied verbatim into each entry's ``identifier``.

    Returns:
        A list of entry dicts with the keys ``date``, ``identifier``,
        ``type``, ``timestamp``, ``headline``, ``content``, ``url``,
        ``images`` and ``is_ai_summary``.
    """
    results = []
    document = BeautifulSoup(html, "html.parser")

    for item in document.select("ul > li.ef"):
        # Article link, made absolute against the site root when present.
        anchor = item.find("a", href=True)
        link = None
        if anchor:
            link = urljoin(BASE_URL, anchor["href"])

        date_node = item.select_one(".post-date")
        if not date_node:
            continue

        # Rewrite the Japanese date markers so that "Y年M月D日" becomes "Y/M/D".
        slashed = date_node.get_text(strip=True)
        for marker, replacement in (("年", "/"), ("月", "/"), ("日", "")):
            slashed = slashed.replace(marker, replacement)

        try:
            parsed = datetime.strptime(slashed, "%Y/%m/%d")
        except ValueError:
            continue

        iso_date = parsed.strftime("%Y-%m-%d")
        # mktime interprets the naive date in the machine's local timezone.
        epoch = int(time.mktime(parsed.timetuple()))

        text_node = item.select_one(".article-text")
        body = None
        if text_node:
            body = text_node.get_text(strip=True)

        picture = item.select_one(".article-img img")
        source = picture["src"] if picture else None
        gallery = [{"image": source, "link": link}] if source else []

        results.append({
            "date": iso_date,
            "identifier": identifier,
            "type": None,
            "timestamp": epoch,
            "headline": None,
            "content": body,
            "url": link,
            "images": gallery,
            'is_ai_summary': False
        })

    return results
def parse_news_api_route(raw_api_data: str, identifier: str, eam_news_site: str = "") -> list:
    """
    Re-map an eamuse news app API response to 573Updates JSON entries.

    The payload must be a JSON object with a ``post_list`` array. Each post
    supplies ``entry_date``, ``entry_time``, ``content`` and ``post_id``, and
    may supply ``image_url``.

    Args:
        raw_api_data: JSON text returned by the API route.
        identifier: Value copied verbatim into each entry's ``identifier``.
        eam_news_site: URL prefix to which ``?post_id=<id>`` is appended to
            build each entry's ``url``.

    Returns:
        A list of entry dicts with the keys ``date``, ``identifier``,
        ``type``, ``timestamp``, ``headline``, ``content``, ``url``,
        ``images`` and ``is_ai_summary``.

    Raises:
        json.JSONDecodeError: If ``raw_api_data`` is not valid JSON.
        KeyError: If ``post_list`` or a required per-post key is missing.
    """
    route_data = json.loads(raw_api_data)
    entries = []
    for post_data in route_data["post_list"]:
        # Format the id as text so a numeric post_id does not raise TypeError
        # the way plain string concatenation would.
        url = f"{eam_news_site}?post_id={post_data['post_id']}"

        # Only emit an image when the URL is actually usable; an empty or
        # null image_url is treated like a missing one, matching the HTML
        # scraper's behaviour.
        image_url = post_data.get("image_url")
        images = [{"image": image_url, "link": url}] if image_url else []

        entries.append({
            "date": post_data["entry_date"],
            "identifier": identifier,
            "type": None,
            "timestamp": post_data["entry_time"],
            "headline": None,
            "content": post_data["content"],
            "url": url,
            "images": images,
            "is_ai_summary": False,
        })
    return entries
|