aboutsummaryrefslogtreecommitdiffstats
path: root/sega
diff options
context:
space:
mode:
authorPinapelz <yukais@pinapelz.com>2025-06-16 19:06:45 -0700
committerPinapelz <yukais@pinapelz.com>2025-06-16 19:06:45 -0700
commit37a18a1e699567115ec038c04e3789f41b38e525 (patch)
treeb911b94d57f5e88e998ade70f842377d0d4e77b8 /sega
parentc32042f7b40484e810456fc4d678f4f9c08f43b8 (diff)
implement idac news scrape
Diffstat (limited to 'sega')
-rw-r--r--sega/idac.py103
1 files changed, 103 insertions, 0 deletions
diff --git a/sega/idac.py b/sega/idac.py
new file mode 100644
index 0000000..3b8a444
--- /dev/null
+++ b/sega/idac.py
@@ -0,0 +1,103 @@
+import json
+from bs4 import BeautifulSoup
+import re
+from datetime import datetime
+from urllib.parse import urljoin
+from constants import IDAC_NEWS_SITE
+
+
+def parse_idac_news_site(site_data: str):
+ soup = BeautifulSoup(site_data, "html.parser")
+ news_entries = []
+ articles = soup.find_all('article', class_=lambda x: x and 'post-' in x)
+ for article in articles:
+ try:
+ post_id = None
+ for cls in article.get('class', []):
+ if cls.startswith('post-') and cls[5:].isdigit():
+ post_id = cls[5:]
+ break
+
+ if not post_id:
+ continue
+ title_section = article.find('h1', class_='entry-title')
+ if not title_section:
+ continue
+ news_title_link = title_section.find('a', class_='news-title')
+ if not news_title_link:
+ continue
+
+ url = news_title_link.get('href', '')
+ headline = news_title_link.get_text(strip=True)
+ date_span = title_section.find('span', class_='entry_date')
+ if not date_span:
+ continue
+
+ date_text = date_span.get_text(strip=True)
+
+ date_match = re.match(r'(\d{4})年(\d{1,2})月(\d{1,2})日', date_text)
+ if not date_match:
+ continue
+
+ year = int(date_match.group(1))
+ month = int(date_match.group(2))
+ day = int(date_match.group(3))
+
+ # Create datetime object (assuming JST timezone, noon time)
+ try:
+ post_date = datetime(year, month, day, 12, 0)
+ timestamp = int(post_date.timestamp())
+ except ValueError:
+ continue
+ post_type = None
+ categories_list = title_section.find('ul', class_='post-categories')
+ if categories_list:
+ category_link = categories_list.find('a')
+ if category_link:
+ post_type = category_link.get_text(strip=True)
+ content = ""
+ entry_summary = article.find('div', class_='entry-summary')
+ if entry_summary:
+ content = entry_summary.get_text(strip=True)
+ content = re.sub(r'続きを読む\s*.*$', '', content).strip()
+ content = re.sub(r'\s*…\s*$', '', content).strip()
+ images = []
+ img_tags = article.find_all('img')
+ for img in img_tags:
+ img_src = img.get('src', '')
+ if img_src and not img_src.endswith('.svg'): # Skip icon/UI images
+ if img_src.startswith('/'):
+ img_src = urljoin('https://info-initialdac.sega.jp', img_src)
+ images.append({
+ 'image': img_src,
+ 'link': url
+ })
+ news_entry = {
+ 'date': post_date.strftime("%Y-%m-%d %H:%M"),
+ 'identifier': "IDAC_NEWS",
+ 'type': post_type,
+ 'timestamp': timestamp,
+ 'headline': headline,
+ 'content': content if content else headline,
+ 'url': url,
+ 'images': images,
+ 'is_ai_summary': False
+ }
+
+ news_entries.append(news_entry)
+
+ except Exception as e:
+ # Skip malformed entries
+ continue
+
+ return news_entries
+
+
+def get_promo_image(site_data: str) -> str:
+ soup = BeautifulSoup(site_data, "html.parser")
+ entry_content = soup.find('div', class_='entry-content')
+ if entry_content:
+ img_tag = entry_content.find('img')
+ if img_tag:
+ return img_tag.get('src', '')
+ return ''
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage