about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- news_feed.py 22
-rw-r--r-- site_scraper.py 24
2 files changed, 42 insertions, 4 deletions
diff --git a/news_feed.py b/news_feed.py
index 602b5a6..e7c1a4a 100644
--- a/news_feed.py
+++ b/news_feed.py
@@ -17,7 +17,7 @@ Generic format for a news entry. All keys are considered to be nullable
"""
from email.utils import parsedate_to_datetime
-from site_scraper import SiteScraper
+from site_scraper import SiteScraper, download_site_as_html
import bemani.sdvx as sound_voltex
import bemani.iidx as iidx
import sega.chuni_jp as chunithm_jp
@@ -28,28 +28,42 @@ import sega.ongeki_jp as ongeki_jp
import constants
def get_news(news_url: str, version=None) -> list:
- scraper = SiteScraper(headless=True)
- site_data = scraper.get_page_source(news_url)
+ # As of right now, none of the supported games require JS to pull data, so the scraper below is disabled
+ # scraper = SiteScraper(headless=True)
+ # site_data = scraper.get_page_source(news_url)
if news_url == constants.SOUND_VOLTEX_EXCEED_GEAR_NEWS_SITE:
+ site_data = download_site_as_html(news_url)
news_posts = sorted(sound_voltex.parse_exceed_gear_news_site(site_data, constants.EAMUSEMENT_BASE_URL), key=lambda x: x['timestamp'], reverse=True)
+
elif news_url == constants.IIDX_PINKY_CRUSH_NEWS_SITE:
+ site_data = download_site_as_html(news_url)
news_posts = sorted(iidx.parse_pinky_crush_news_site(site_data, constants.EAMUSEMENT_BASE_URL), key=lambda x: x['timestamp'], reverse=True)
+
elif news_url == constants.CHUNITHM_JP_NEWS_SITE:
+ site_data = download_site_as_html(news_url)
if version == constants.CHUNITHM_VERSION.VERSE:
news_posts = sorted(chunithm_jp.parse_chuni_jp_verse_news_site(site_data), key=lambda x: x['timestamp'], reverse=True)
+
elif news_url == constants.CHUNITHM_INTL_NEWS_SITE:
+ site_data = download_site_as_html(news_url)
if version == constants.CHUNITHM_VERSION.LUMINOUS_PLUS:
news_posts = sorted(chuni_intl.parse_chuni_intl_luminous_plus_news_site(site_data), key=lambda x: x['timestamp'], reverse=True)
+
elif news_url == constants.MAIMAIDX_JP_NEWS_SITE:
+ site_data = download_site_as_html(news_url)
if version == constants.MAIMAIDX_VERSION.PRISM_PLUS:
news_posts = sorted(maimaidx_jp.parse_maimaidx_jp_prism_plus_news_site(site_data), key=lambda x: x['timestamp'], reverse=True)
+
elif news_url == constants.MAIMAIDX_INTL_NEWS_SITE:
+ site_data = download_site_as_html(news_url)
if version == constants.MAIMAIDX_VERSION.PRISM:
news_posts = sorted(maimaidx_intl.parse_maimaidx_intl_prism_news_site(site_data), key=lambda x: x['timestamp'], reverse=True)
+
elif news_url == constants.ONGEKI_JP_NEWS_SITE:
+ site_data = download_site_as_html(news_url)
if version == constants.ONGEKI_VERSION.REFRESH:
news_posts = sorted(ongeki_jp.parse_ongeki_refresh_news_site(site_data), key=lambda x: x['timestamp'], reverse=True)
else:
news_posts = []
- scraper.close()
+ # scraper.close()
return news_posts
diff --git a/site_scraper.py b/site_scraper.py
index f801c20..9efa4b6 100644
--- a/site_scraper.py
+++ b/site_scraper.py
@@ -2,6 +2,7 @@ from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium import webdriver
from dotenv import load_dotenv
+import requests
import time
import os
@@ -65,3 +66,26 @@ class SiteScraper:
self.driver.quit()
self.service.stop()
print("WebDriver closed successfully")
+
+
+def download_site_as_html(url: str, timeout: int = 10) -> str:
+ headers = {
+ "User-Agent": (
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
+ "Chrome/122.0.0.0 Safari/537.36"
+ ),
+ "Accept": (
+ "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"
+ ),
+ "Accept-Language": "en-US,en;q=0.9",
+ "Connection": "keep-alive",
+ }
+
+ try:
+ response = requests.get(url, headers=headers, timeout=timeout)
+ response.raise_for_status()
+ return response.text
+ except requests.RequestException as e:
+ print(f"Error downloading {url}: {e}")
+ return ""
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage