diff options
Diffstat (limited to 'bemani/sdvx.py')
| -rw-r--r-- | bemani/sdvx.py | 76 |
1 files changed, 76 insertions, 0 deletions
def parse_nabla_news_site(html: str):
    """Turn the news page markup into a list of news entry dicts.

    Each ``<li>`` matched by ``#news-inner ul.news li`` that contains at
    least one ``<strong>`` produces one entry. The first ``<strong>`` holds
    the date string (expected as ``YYYY.MM.DD``), the second one, when
    present, holds the headline. The remaining child nodes are flattened
    into the body text and every ``<img>`` becomes an image record.

    Args:
        html: Raw HTML of the news page.

    Returns:
        A list of dicts with the keys ``date``, ``identifier``, ``type``,
        ``timestamp``, ``headline``, ``content``, ``url``, ``images`` and
        ``is_ai_summary``. ``timestamp`` is ``None`` when the date string
        does not match the expected format.
    """
    site_root = "https://p.eagate.573.jp"
    document = BeautifulSoup(html, 'html.parser')

    results = []
    for item in document.select('#news-inner ul.news li'):
        strongs = item.select('strong')
        if not strongs:
            # Items without any <strong> carry no date and are ignored.
            continue

        date_str = strongs[0].text.strip()
        try:
            # The parsed datetime is naive, so timestamp() interprets it
            # in the local timezone of the running process.
            timestamp = int(
                datetime.strptime(date_str, "%Y.%m.%d").timestamp()
            )
        except ValueError:
            timestamp = None

        headline_text = strongs[1].text.strip() if len(strongs) > 1 else None

        # Drop purely presentational wrappers so their children become
        # direct children of the <li> before the content walk below.
        for wrapper in item.select('font, b, u, span'):
            wrapper.unwrap()

        pieces = []
        for node in item.contents:
            if not hasattr(node, 'name'):
                raw = str(node).strip()
                if raw and raw not in (date_str, headline_text):
                    pieces.append(raw)
                continue

            tag_name = node.name
            if tag_name == 'strong':
                # Date and headline were already captured above.
                continue
            if tag_name == 'br':
                pieces.append('\n')
            elif tag_name == 'a' and 'link-text' in node.get('class', []):
                pieces.append(node.text.strip())
            elif tag_name != 'img':  # Images are collected separately
                pieces.append(node.get_text(strip=True))

        content = '\n'.join(piece for piece in pieces if piece).strip()

        images = []
        for img in item.select('img'):
            # Prefer the lazy-load attribute over the plain src.
            source = img.get('data-original') or img.get('src')
            if not source:
                continue
            if isinstance(source, str):
                if source.startswith('data:'):
                    # Inline data URIs are skipped.
                    continue
                source = urljoin(site_root, source)

            link = None
            anchor = img.find_parent('a')
            if anchor and hasattr(anchor, 'get'):
                target = anchor.get('href')
                if target and isinstance(target, str):
                    link = urljoin(site_root, target)

            record = {'image': source, 'link': link}
            if record not in images:
                images.append(record)

        results.append({
            'date': date_str,
            'identifier': 'SOUND_VOLTEX',
            'type': None,
            'timestamp': timestamp,
            'headline': headline_text,
            'content': content,
            "url": None,
            'images': images,
            'is_ai_summary': False
        })

    return results
