diff options
| author | Pinapelz <yukais@pinapelz.com> | 2025-06-16 19:06:45 -0700 |
|---|---|---|
| committer | Pinapelz <yukais@pinapelz.com> | 2025-06-16 19:06:45 -0700 |
| commit | 37a18a1e699567115ec038c04e3789f41b38e525 (patch) | |
| tree | b911b94d57f5e88e998ade70f842377d0d4e77b8 | |
| parent | c32042f7b40484e810456fc4d678f4f9c08f43b8 (diff) | |
Implement IDAC news scraping
| -rw-r--r-- | constants.py | 1 | ||||
| -rw-r--r-- | generate.py | 5 | ||||
| -rw-r--r-- | news_feed.py | 10 | ||||
| -rw-r--r-- | sega/idac.py | 103 |
4 files changed, 117 insertions, 2 deletions
diff --git a/constants.py b/constants.py index e1b0369..c252eab 100644 --- a/constants.py +++ b/constants.py @@ -23,6 +23,7 @@ CHUNITHM_INTL_NEWS_SITE="https://info-chunithm.sega.com/" MAIMAIDX_JP_NEWS_SITE="https://info-maimai.sega.jp/" MAIMAIDX_INTL_NEWS_SITE="https://maimai.sega.com/download/" ONGEKI_JP_NEWS_SITE="https://info-ongeki.sega.jp/" +IDAC_NEWS_SITE="https://info-initialdac.sega.jp/" MUSIC_DIVER_NEWS="https://mypage.musicdiver.jp/api/news?lang=en" STREET_FIGHTER_NEWS_SITE="https://sf6ta.jp/info/list" diff --git a/generate.py b/generate.py index e94384e..900777d 100644 --- a/generate.py +++ b/generate.py @@ -122,6 +122,9 @@ def generate_maimaidx_intl_news_file(): def generate_chunithm_intl_news_file(): return generate_news_file("chunithm_intl_news", constants.CHUNITHM_INTL_NEWS_SITE, constants.CHUNITHM_VERSION.VERSE) +def generate_idac_news_file(): + return generate_news_file("idac_news", constants.IDAC_NEWS_SITE) + def generate_music_diver_news_file(): return generate_news_file("music_diver_news", constants.MUSIC_DIVER_NEWS) @@ -161,6 +164,7 @@ if __name__ == "__main__": chunithm_jp_news_data = generate_chunithm_jp_news_file() maimaidx_jp_news_data = generate_maimaidx_jp_news_file() ongeki_jp_news_data = generate_ongeki_jp_news_file() + idac_news_data = generate_idac_news_file() maimaidx_intl_news_data = generate_maimaidx_intl_news_file() chunithm_intl_news_data = generate_chunithm_intl_news_file() music_diver_news_data = generate_music_diver_news_file() @@ -184,6 +188,7 @@ if __name__ == "__main__": chunithm_jp_news_data, maimaidx_jp_news_data, ongeki_jp_news_data, + idac_news_data, maimaidx_intl_news_data, chunithm_intl_news_data, music_diver_news_data, diff --git a/news_feed.py b/news_feed.py index 7c781db..6bd5116 100644 --- a/news_feed.py +++ b/news_feed.py @@ -30,6 +30,7 @@ import sega.chuni_intl as chuni_intl import sega.maimaidx_jp as maimaidx_jp import sega.maimaidx_intl as maimaidx_intl import sega.ongeki_jp as ongeki_jp +import sega.idac as 
idac import taito.music_diver as music_diver import taito.street_fighter as street_fighter import bandai_namco.taiko as taiko @@ -159,6 +160,13 @@ def get_news(news_url: str, version=None) -> list: news_posts = sorted(ongeki_jp.parse_ongeki_refresh_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) news_posts = translate.add_translate_text_to_en(news_posts) + elif news_url == constants.IDAC_NEWS_SITE: + site_data = download_site_as_html(news_url) + news_posts = sorted(idac.parse_idac_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) + for news in news_posts: + promo_image_url = idac.get_promo_image(download_site_as_html(news["url"])) + news["images"] = [{'image': promo_image_url, 'link': None}] + elif news_url == constants.MUSIC_DIVER_NEWS: api_data = download_site_as_html(news_url) news_posts = sorted(music_diver.parse_music_diver_news_json(api_data), key=lambda x: x['timestamp'], reverse=True) @@ -166,8 +174,6 @@ def get_news(news_url: str, version=None) -> list: elif news_url == constants.STREET_FIGHTER_NEWS_SITE: site_data = download_site_as_html(news_url) news_posts = sorted(street_fighter.parse_sf_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) - print(news_posts) - exit() elif news_url == constants.TAIKO_BLOG_SITE: diff --git a/sega/idac.py b/sega/idac.py new file mode 100644 index 0000000..3b8a444 --- /dev/null +++ b/sega/idac.py @@ -0,0 +1,103 @@ +import json +from bs4 import BeautifulSoup +import re +from datetime import datetime +from urllib.parse import urljoin +from constants import IDAC_NEWS_SITE + + +def parse_idac_news_site(site_data: str): + soup = BeautifulSoup(site_data, "html.parser") + news_entries = [] + articles = soup.find_all('article', class_=lambda x: x and 'post-' in x) + for article in articles: + try: + post_id = None + for cls in article.get('class', []): + if cls.startswith('post-') and cls[5:].isdigit(): + post_id = cls[5:] + break + + if not post_id: + continue + title_section = 
article.find('h1', class_='entry-title') + if not title_section: + continue + news_title_link = title_section.find('a', class_='news-title') + if not news_title_link: + continue + + url = news_title_link.get('href', '') + headline = news_title_link.get_text(strip=True) + date_span = title_section.find('span', class_='entry_date') + if not date_span: + continue + + date_text = date_span.get_text(strip=True) + + date_match = re.match(r'(\d{4})年(\d{1,2})月(\d{1,2})日', date_text) + if not date_match: + continue + + year = int(date_match.group(1)) + month = int(date_match.group(2)) + day = int(date_match.group(3)) + + # Create datetime object (assuming JST timezone, noon time) + try: + post_date = datetime(year, month, day, 12, 0) + timestamp = int(post_date.timestamp()) + except ValueError: + continue + post_type = None + categories_list = title_section.find('ul', class_='post-categories') + if categories_list: + category_link = categories_list.find('a') + if category_link: + post_type = category_link.get_text(strip=True) + content = "" + entry_summary = article.find('div', class_='entry-summary') + if entry_summary: + content = entry_summary.get_text(strip=True) + content = re.sub(r'続きを読む\s*.*$', '', content).strip() + content = re.sub(r'\s*…\s*$', '', content).strip() + images = [] + img_tags = article.find_all('img') + for img in img_tags: + img_src = img.get('src', '') + if img_src and not img_src.endswith('.svg'): # Skip icon/UI images + if img_src.startswith('/'): + img_src = urljoin('https://info-initialdac.sega.jp', img_src) + images.append({ + 'image': img_src, + 'link': url + }) + news_entry = { + 'date': post_date.strftime("%Y-%m-%d %H:%M"), + 'identifier': "IDAC_NEWS", + 'type': post_type, + 'timestamp': timestamp, + 'headline': headline, + 'content': content if content else headline, + 'url': url, + 'images': images, + 'is_ai_summary': False + } + + news_entries.append(news_entry) + + except Exception as e: + # Skip malformed entries + continue + + return 
news_entries + + +def get_promo_image(site_data: str) -> str: + soup = BeautifulSoup(site_data, "html.parser") + entry_content = soup.find('div', class_='entry-content') + if entry_content: + img_tag = entry_content.find('img') + if img_tag: + return img_tag.get('src', '') + return '' |
