diff options
| author | Pinapelz <yukais@pinapelz.com> | 2025-04-22 17:36:13 -0700 |
|---|---|---|
| committer | Pinapelz <yukais@pinapelz.com> | 2025-04-22 17:36:13 -0700 |
| commit | 4583d95b06071b637cf6794f332b099d30b9a5c3 (patch) | |
| tree | 4df38a9c3d353e1cad4b54ca917b24e9af09881b /community | |
| parent | 7b5ae11e469c762ed8145cf74bd1c67d9002c52b (diff) | |
add support for mus_plus
Diffstat (limited to 'community')
| -rw-r--r-- | community/museca_plus.py | 41 |
1 files changed, 41 insertions, 0 deletions
# diff --git a/community/museca_plus.py b/community/museca_plus.py (new file mode 100644, index 0000000..81c7313)
"""Parser for news posts published on https://museca.plus/."""

import re
import time
from datetime import datetime
from urllib.parse import urljoin

from bs4 import BeautifulSoup

# Root against which relative image and link URLs found in the page are resolved.
_BASE_URL = "https://museca.plus/"

# Matches the first YYYY-MM-DD date in a paragraph's text. Compiled once at
# import time instead of being looked up on every loop iteration.
_DATE_RE = re.compile(r'(\d{4}-\d{2}-\d{2})')


def _collect_images(paragraph) -> list:
    """Return one ``{"image": ..., "link": ...}`` dict per usable <img> in *paragraph*.

    ``image`` is the absolute URL of the image. ``link`` is the absolute URL of
    the closest enclosing ``<a href>``, or ``None`` when the image is not
    wrapped in a link.
    """
    images = []
    for img in paragraph.find_all("img"):
        src = img.get("src")
        if not src:
            # urljoin(base, None) silently returns the bare base URL, which
            # would be recorded as if it were an image; skip such tags.
            continue
        anchor = img.find_parent("a")
        href = anchor.get("href") if anchor is not None else None
        images.append({
            "image": urljoin(_BASE_URL, src),
            "link": urljoin(_BASE_URL, href) if href else None,
        })
    return images


def parse_museca_plus_news_site(html: str) -> list:
    """Extract dated news posts from the HTML of the MUSECA Plus site.

    Every ``<p>`` that is a direct child of ``div.subcontainer.center.text``
    and whose text contains a ``YYYY-MM-DD`` date becomes one post. Paragraphs
    without a date, with an invalid calendar date, or with a date the platform
    cannot convert to a timestamp are skipped.

    Args:
        html: Raw HTML of the page.

    Returns:
        A list of dicts in document order with the keys ``date`` (the matched
        ``YYYY-MM-DD`` string), ``identifier`` (always ``'MUSECA_PLUS'``),
        ``type``, ``headline`` and ``url`` (always ``None``), ``timestamp``
        (integer epoch seconds for midnight of ``date``, interpreted in the
        process's local timezone by ``time.mktime``), ``content`` (the
        paragraph text, whitespace-normalised and including the date) and
        ``images`` (see ``_collect_images``).
    """
    soup = BeautifulSoup(html, "html.parser")
    news_posts = []
    for p in soup.select("div.subcontainer.center.text > p"):
        # The same text is used for the date search and as the post content.
        text = p.get_text(separator=' ', strip=True)
        date_match = _DATE_RE.search(text)
        if not date_match:
            continue
        date_str = date_match.group(1)
        try:
            dt = datetime.strptime(date_str, "%Y-%m-%d")
            timestamp = int(time.mktime(dt.timetuple()))
        except (ValueError, OverflowError):
            # ValueError: not a real calendar date (e.g. month 13).
            # OverflowError: mktime cannot represent the date on this
            # platform (e.g. year 0001); one bad paragraph must not abort
            # the whole parse.
            continue

        news_posts.append({
            'date': date_str,
            'identifier': 'MUSECA_PLUS',
            'type': None,
            'timestamp': timestamp,
            'headline': None,
            'content': text,
            'url': None,
            'images': _collect_images(p),
        })

    return news_posts
