"""Generic news-page parser for e-amusement (eam.573.jp) game pages."""

import time
from datetime import datetime
from urllib.parse import urljoin

from bs4 import BeautifulSoup

BASE_URL = "https://eam.573.jp"


def parse_news_page(html: str, identifier: str) -> list:
    """Extract news entries from the HTML of an e-amusement news page.

    Every ``ul > li.ef`` element is treated as one news item. Items that
    have no ``.post-date`` element, or whose date text does not parse as
    ``YYYY年MM月DD日``, are skipped.

    Args:
        html: Raw HTML of the news page.
        identifier: Value copied verbatim into each entry's
            ``"identifier"`` field.

    Returns:
        A list of dicts, one per parsed item, with the keys ``date``
        (``YYYY-MM-DD`` string), ``identifier``, ``type`` (always
        ``None``), ``timestamp`` (int seconds), ``headline`` (always
        ``None``), ``content`` (text of ``.article-text`` or ``None``),
        ``url`` (absolute link or ``None``) and ``images`` (list of
        ``{"image": ..., "link": ...}`` dicts, possibly empty).
    """
    soup = BeautifulSoup(html, "html.parser")
    entries = []

    for li in soup.select("ul > li.ef"):
        date_tag = li.select_one(".post-date")
        if date_tag is None:
            continue

        # Turn "2025年4月17日" into "2025/4/17" so strptime can read it.
        raw_date = (
            date_tag.get_text(strip=True)
            .replace("年", "/")
            .replace("月", "/")
            .replace("日", "")
        )
        try:
            date_obj = datetime.strptime(raw_date, "%Y/%m/%d")
        except ValueError:
            # Unrecognised date format: skip the item rather than guess.
            continue

        # NOTE(review): time.mktime interprets the naive date in the host's
        # local timezone, so the timestamp depends on where this runs.
        # Kept as-is for compatibility with existing consumers.
        timestamp = int(time.mktime(date_obj.timetuple()))

        a_tag = li.find("a", href=True)
        url = urljoin(BASE_URL, a_tag["href"]) if a_tag else None

        content_tag = li.select_one(".article-text")
        content = content_tag.get_text(strip=True) if content_tag else None

        img_tag = li.select_one(".article-img img")
        # .get() tolerates an <img> without a src attribute (e.g. lazy-loaded
        # images) instead of raising KeyError and aborting the whole page.
        image_src = img_tag.get("src") if img_tag else None
        images = []
        if image_src:
            images.append({
                # Resolve relative paths the same way as the article link;
                # already-absolute URLs pass through urljoin unchanged.
                "image": urljoin(BASE_URL, image_src),
                "link": url,
            })

        entries.append({
            "date": date_obj.strftime("%Y-%m-%d"),
            "identifier": identifier,
            "type": None,
            "timestamp": timestamp,
            "headline": None,
            "content": content,
            "url": url,
            "images": images,
        })

    return entries