diff options
| author | Pinapelz <yukais@pinapelz.com> | 2025-04-13 18:00:28 -0700 |
|---|---|---|
| committer | Pinapelz <yukais@pinapelz.com> | 2025-04-13 18:00:28 -0700 |
| commit | 18a76b5bd09df3b1f34208bed4663939ca8f3f18 (patch) | |
| tree | 4bb2de9563a6ecf13816825493199beb1b8e9eb4 /sega | |
| parent | 03bc3271e0719e33c9517180bb2d39b0d73b7b90 (diff) | |
add CHUNITHM JPN scraper
Diffstat (limited to 'sega')
| -rw-r--r-- | sega/chuni_jp.py | 60 |
1 files changed, 60 insertions, 0 deletions
# sega/chuni_jp.py (new file added by this commit)
from bs4 import BeautifulSoup
from datetime import datetime, timezone, timedelta
from typing import Optional
from urllib.parse import urljoin
import re

# Entry titles carry the publication date as "YYYY.MM.DD".
_DATE_PATTERN = re.compile(r"(\d{4}\.\d{2}\.\d{2})")

# Dates are interpreted as midnight at UTC+9 (Japan Standard Time).
_JST = timezone(timedelta(hours=9))


def _extract_date(a_tag) -> Optional[str]:
    """Return the "YYYY.MM.DD" string from the entry's title span, or None."""
    title_box = a_tag.find("div", class_="chuniCommonBox-inner-title")
    if not title_box:
        return None
    title_span = title_box.find("span", class_="title")
    if not title_span:
        return None
    match = _DATE_PATTERN.search(title_span.get_text(strip=True))
    return match.group(1) if match else None


def _to_timestamp(date_str: Optional[str]) -> Optional[int]:
    """Convert a "YYYY.MM.DD" string to a Unix timestamp at midnight UTC+9."""
    if not date_str:
        return None
    try:
        parsed = datetime.strptime(date_str, "%Y.%m.%d")
    except ValueError:
        # The regex only checks the digit layout, so impossible dates such as
        # "2025.13.40" reach this point; treat them as "no timestamp".
        return None
    return int(parsed.replace(tzinfo=_JST).timestamp())


def _resolve(url: Optional[str], base_url: Optional[str]) -> Optional[str]:
    """Resolve *url* against *base_url* when both are present."""
    if base_url and url is not None:
        return urljoin(base_url, url)
    return url


def parse_chuni_jp_verse_news_site(html: str, *, base_url: Optional[str] = None):
    """Parse the CHUNITHM (JP) news page markup into a list of entry dicts.

    Every ``<a href=...>`` inside ``div.newsMainWrapper-left`` that contains a
    ``div.chuniCommonBox-inner`` is treated as one news entry.

    Args:
        html: Raw HTML of the news page.
        base_url: Optional URL used to resolve the entry ``href`` and image
            ``src`` via ``urljoin``. When ``None`` (the default) both values
            are returned exactly as they appear in the markup.

    Returns:
        A list of dicts, in document order, each with the keys:

        * ``url``       -- the anchor's ``href``.
        * ``date``      -- ``"YYYY.MM.DD"`` taken from the title span, or None.
        * ``type``      -- always None.
        * ``timestamp`` -- Unix timestamp of ``date`` at midnight UTC+9, or
          None when the date is missing or not a real calendar date.
        * ``headline``  -- the first image's ``alt`` text when present and
          non-empty, otherwise the text of the main content box; None when
          the entry has no main content box.
        * ``content``   -- text of the main content box ("" when absent).
        * ``images``    -- ``{"image": <img src>, "link": <entry url>}``;
          both are None when the entry has no image.

        An empty list is returned when the news wrapper is not found.
    """
    soup = BeautifulSoup(html, "html.parser")
    news_entries = []

    news_wrapper = soup.find("div", class_="newsMainWrapper-left")
    if not news_wrapper:
        return news_entries

    for a_tag in news_wrapper.find_all("a", href=True):
        # Anchors without the common box are navigation/other links.
        if not a_tag.find("div", class_="chuniCommonBox-inner"):
            continue

        news_url = _resolve(a_tag.get("href"), base_url)
        date_str = _extract_date(a_tag)

        headline = None
        content_text = ""
        images = {"image": None, "link": None}

        main_content = a_tag.find("div", class_="chuniCommonBox-inner-main")
        if main_content:
            content_text = main_content.get_text(separator=" ", strip=True)
            img_tag = main_content.find("img")
            if img_tag:
                images["image"] = _resolve(img_tag.get("src"), base_url)
                images["link"] = news_url
            # Prefer the banner's alt text; fall back to the box's own text.
            alt_text = img_tag.get("alt") if img_tag else None
            headline = alt_text if alt_text else content_text

        news_entries.append(
            {
                "url": news_url,
                "date": date_str,
                "type": None,
                "timestamp": _to_timestamp(date_str),
                "headline": headline,
                "content": content_text,
                "images": images,
            }
        )

    return news_entries
