about | summary | refs | log | tree | commit | diff | stats
path: root/bemani
diff options
context:
space:
mode:
author Pinapelz <yukais@pinapelz.com> 2025-10-16 11:26:27 -0700
committer Pinapelz <yukais@pinapelz.com> 2025-10-16 11:26:27 -0700
commit bcdec514d4823031b39d5c726b371a5bfbefd240 (patch)
tree 40e1056bcb1d4572ee3caedb67db6080a0af4cb6 /bemani
parent 9aeca20449d6b4083d53b510ef8ab819bde43846 (diff)
fix: change polaris chord to scrape skeleton html, add date validation
Diffstat (limited to 'bemani')
-rw-r--r-- bemani/polaris_chord.py 79
1 files changed, 52 insertions, 27 deletions
diff --git a/bemani/polaris_chord.py b/bemani/polaris_chord.py
index 3b13236..b4c96cd 100644
--- a/bemani/polaris_chord.py
+++ b/bemani/polaris_chord.py
@@ -11,54 +11,79 @@ CATEGORY_MAP = {
"i_04": "OTHER"
}
-
def parse_polaris_chord_news_site(html: str) -> list[dict]:
- base_url = "https://p.eagate.573.jp/"
+ base_url = "https://eacache.s.konaminet.jp/game/polarischord/pc/"
soup = BeautifulSoup(html, 'html.parser')
news_list = []
- for li in soup.select('#info-news li.news'):
- raw_date = li.find('li', class_='news_date').text.strip()
- match = re.search(r'(\d{4}/\d{1,2}/\d{1,2})', raw_date)
- if not match:
+
+ for li in soup.select('li.news'):
+ raw_type = li.get('data-category')
+ post_type = CATEGORY_MAP.get(raw_type, "OTHER")
+
+ # Attempt to extract date from data-date (format: 20251015-01)
+ raw_date = li.get('data-date', '')
+ # Extract first 8 digits: YYYYMMDD
+ if len(raw_date) < 8:
+ continue
+ date_part = raw_date[:8]
+ date_match = re.match(r'(\d{4})(\d{2})(\d{2})', date_part)
+ if not date_match:
+ continue
+ year, month, day = map(int, date_match.groups())
+ if month < 1 or month > 12:
+ continue
+ if day < 1 or day > 31:
continue
- date_str = match.group(1)
+ if month in [4, 6, 9, 11] and day > 30:
+ continue
+ if month == 2 and day > 29:
+ continue
+
+ date_str = f"{year}/{month:02}/{day:02}"
try:
- dt = datetime.strptime(date_str, '%Y/%m/%d')
- except ValueError:
+ jst = pytz.timezone('Asia/Tokyo')
+ dt_jst = jst.localize(datetime(year, month, day))
+ timestamp = int(dt_jst.timestamp())
+ except (ValueError, OverflowError):
+ # Skip if datetime creation fails
continue
- jst = pytz.timezone('Asia/Tokyo')
- dt_jst = jst.localize(dt)
- timestamp = int(dt_jst.timestamp())
- raw_type = li.get('data-category')
- post_type = CATEGORY_MAP.get(raw_type)
+ # Find the news-main ul inside the li
+ news_main = li.find('ul', class_='news-main')
+ if not news_main:
+ continue
- headline = li.find('li', class_='news_title').text.strip()
- detail = li.find('li', class_='news_detail')
- content = detail.get_text().strip()
+ # Extract headline from news_title li
+ headline_li = news_main.find('li', class_='news_title')
+ headline_text = headline_li.get_text(strip=True) if headline_li else None
- first_a = detail.find('a', href=True)
- url = urljoin(base_url, first_a['href']) if first_a else None
+ # Extract content from news_detail li
+ detail_li = news_main.find('li', class_='news_detail')
+ content = detail_li.get_text(strip=True) if detail_li else None
+ # Find all images in the detail section
images = []
- for img in detail.find_all('img'):
- img_url = img.get('src')
- link = None
- if img.parent.name == 'a' and img.parent.has_attr('href'):
- link = img.parent['href']
- images.append({'image': img_url, 'link': link})
+ if detail_li:
+ for img in detail_li.find_all('img'):
+ # Check both src and data-src (for lazy loaded images)
+ img_url = img.get('src') or img.get('data-src')
+ if img_url and not img_url.startswith('http'):
+ img_url = urljoin(base_url, img_url)
+ if img_url:
+ images.append({'image': img_url, 'link': None})
entry = {
'date': date_str,
'identifier': "POLARIS_CHORD",
'type': post_type,
'timestamp': timestamp,
- 'headline': headline,
+ 'headline': headline_text,
'content': content,
- 'url': url,
+ 'url': None,
'images': images,
'is_ai_summary': False,
}
news_list.append(entry)
+
return news_list
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage