diff options
| author | Pinapelz <yukais@pinapelz.com> | 2025-04-17 12:08:12 -0700 |
|---|---|---|
| committer | Pinapelz <yukais@pinapelz.com> | 2025-04-17 12:08:12 -0700 |
| commit | d6c2824bc67cfb708763fba9412c6610d9cd05bb (patch) | |
| tree | 8914d0f0ba6e8039fa3ac9a59856ac01c8645094 /konami | |
| parent | 9059bca2439f3c5f2f91e4bd8d97a0da184be393 (diff) | |
add generic parser for eamusement games
The information published for each game differs, and some of it is better suited to our format than the rest.
Diffstat (limited to 'konami')
| -rw-r--r-- | konami/eamuse_app.py | 51 |
1 files changed, 51 insertions, 0 deletions
"""Generic parser for e-amusement news pages (konami/eamuse_app.py)."""

import time
from datetime import datetime
from urllib.parse import urljoin

from bs4 import BeautifulSoup

BASE_URL = "https://eam.573.jp"

# Rewrites Japanese date markers into the separators strptime expects,
# e.g. "2025年04月17日" -> "2025/04/17".
_DATE_MARKERS = str.maketrans({"年": "/", "月": "/", "日": None})


def parse_news_page(html: str, identifier: str) -> list:
    """Extract news entries from the HTML of a news listing page.

    Every ``ul > li.ef`` element is treated as one entry. Entries that have
    no ``.post-date`` element, or whose date text does not parse as
    year/month/day, are skipped (best effort, no exception is raised).

    Args:
        html: Raw HTML of the page.
        identifier: Value copied verbatim into each entry's ``identifier``.

    Returns:
        A list of dicts, in document order, with the keys ``date``
        (``YYYY-MM-DD``), ``identifier``, ``type`` (always ``None``),
        ``timestamp``, ``headline`` (always ``None``), ``content``, ``url``
        and ``images``. ``url`` and ``content`` are ``None`` when the
        corresponding element is missing. ``images`` holds at most one
        ``{"image": ..., "link": ...}`` dict.
    """
    soup = BeautifulSoup(html, "html.parser")
    entries = []

    for item in soup.select("ul > li.ef"):
        date_tag = item.select_one(".post-date")
        if not date_tag:
            continue
        raw_date = date_tag.get_text(strip=True).translate(_DATE_MARKERS)
        try:
            parsed = datetime.strptime(raw_date, "%Y/%m/%d")
        except ValueError:
            # Unrecognised date format: drop this entry, keep the rest.
            continue

        anchor = item.find("a", href=True)
        url = urljoin(BASE_URL, anchor["href"]) if anchor else None

        content_tag = item.select_one(".article-text")
        content = content_tag.get_text(strip=True) if content_tag else None

        # .get() instead of ["src"]: an <img> without a src attribute must
        # not raise KeyError and abort parsing of the whole page.
        img_tag = item.select_one(".article-img img")
        src = img_tag.get("src") if img_tag else None
        images = []
        if src:
            images.append({
                # Resolve relative paths against the site root, same as the
                # article link; absolute URLs pass through unchanged.
                "image": urljoin(BASE_URL, src),
                "link": url,
            })

        entries.append({
            "date": parsed.strftime("%Y-%m-%d"),
            "identifier": identifier,
            "type": None,
            # Midnight of the posted date, interpreted by time.mktime in the
            # host's local timezone.
            "timestamp": int(time.mktime(parsed.timetuple())),
            "headline": None,
            "content": content,
            "url": url,
            "images": images,
        })

    return entries
