implement idac news scrape

author: Pinapelz <yukais@pinapelz.com> 2025-06-16 19:06:45 -0700
committer: Pinapelz <yukais@pinapelz.com> 2025-06-16 19:06:45 -0700
commit: 37a18a1e699567115ec038c04e3789f41b38e525 (patch)
tree: b911b94d57f5e88e998ade70f842377d0d4e77b8 /sega
parent: c32042f7b40484e810456fc4d678f4f9c08f43b8 (diff)
1 files changed, 103 insertions, 0 deletions
diff --git a/sega/idac.py b/sega/idac.py
new file mode 100644
index 0000000..3b8a444
--- /dev/null
+++ b/sega/idac.py
@@ -0,0 +1,103 @@
+import json
+from bs4 import BeautifulSoup
+import re
+from datetime import datetime
+from urllib.parse import urljoin
+from constants import IDAC_NEWS_SITE
+
+
+def parse_idac_news_site(site_data: str):
+    soup = BeautifulSoup(site_data, "html.parser")
+    news_entries = []
+    articles = soup.find_all('article', class_=lambda x: x and 'post-' in x)
+    for article in articles:
+        try:
+            post_id = None
+            for cls in article.get('class', []):
+                if cls.startswith('post-') and cls[5:].isdigit():
+                    post_id = cls[5:]
+                    break
+
+            if not post_id:
+                continue
+            title_section = article.find('h1', class_='entry-title')
+            if not title_section:
+                continue
+            news_title_link = title_section.find('a', class_='news-title')
+            if not news_title_link:
+                continue
+
+            url = news_title_link.get('href', '')
+            headline = news_title_link.get_text(strip=True)
+            date_span = title_section.find('span', class_='entry_date')
+            if not date_span:
+                continue
+
+            date_text = date_span.get_text(strip=True)
+
+            date_match = re.match(r'(\d{4})年(\d{1,2})月(\d{1,2})日', date_text)
+            if not date_match:
+                continue
+
+            year = int(date_match.group(1))
+            month = int(date_match.group(2))
+            day = int(date_match.group(3))
+
+            # Create datetime object (assuming JST timezone, noon time)
+            try:
+                post_date = datetime(year, month, day, 12, 0)
+                timestamp = int(post_date.timestamp())
+            except ValueError:
+                continue
+            post_type = None
+            categories_list = title_section.find('ul', class_='post-categories')
+            if categories_list:
+                category_link = categories_list.find('a')
+                if category_link:
+                    post_type = category_link.get_text(strip=True)
+            content = ""
+            entry_summary = article.find('div', class_='entry-summary')
+            if entry_summary:
+                content = entry_summary.get_text(strip=True)
+                content = re.sub(r'続きを読む\s*.*$', '', content).strip()
+                content = re.sub(r'\s*…\s*$', '', content).strip()
+            images = []
+            img_tags = article.find_all('img')
+            for img in img_tags:
+                img_src = img.get('src', '')
+                if img_src and not img_src.endswith('.svg'):  # Skip icon/UI images
+                    if img_src.startswith('/'):
+                        img_src = urljoin('https://info-initialdac.sega.jp', img_src)
+                    images.append({
+                        'image': img_src,
+                        'link': url
+                    })
+            news_entry = {
+                'date': post_date.strftime("%Y-%m-%d %H:%M"),
+                'identifier': "IDAC_NEWS",
+                'type': post_type,
+                'timestamp': timestamp,
+                'headline': headline,
+                'content': content if content else headline,
+                'url': url,
+                'images': images,
+                'is_ai_summary': False
+            }
+
+            news_entries.append(news_entry)
+
+        except Exception as e:
+            # Skip malformed entries
+            continue
+
+    return news_entries
+
+
+def get_promo_image(site_data: str) -> str:
+    soup = BeautifulSoup(site_data, "html.parser")
+    entry_content = soup.find('div', class_='entry-content')
+    if entry_content:
+        img_tag = entry_content.find('img')
+        if img_tag:
+            return img_tag.get('src', '')
+    return ''
author	Pinapelz <yukais@pinapelz.com>	2025-06-16 19:06:45 -0700
committer	Pinapelz <yukais@pinapelz.com>	2025-06-16 19:06:45 -0700
commit	37a18a1e699567115ec038c04e3789f41b38e525 (patch)
tree	b911b94d57f5e88e998ade70f842377d0d4e77b8 /sega
parent	c32042f7b40484e810456fc4d678f4f9c08f43b8 (diff)