diff options
| field | value | date |
|---|---|---|
| author | Pinapelz <yukais@pinapelz.com> | 2025-04-15 01:37:45 -0700 |
| committer | Pinapelz <yukais@pinapelz.com> | 2025-04-15 01:37:45 -0700 |
| commit | db96364d2301c79a05998be0fcf27e6013517b22 (patch) | |
| tree | 433142407b3bba552c510ec4431b1a7ee5684c7b | |
| parent | c9d2521b7beb5b5d1077565ec968cb3421497417 (diff) | |

optimization: pull site using requests for those that don't need JS

| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | news_feed.py | 22 |
| -rw-r--r-- | site_scraper.py | 24 |

2 files changed, 42 insertions, 4 deletions
diff --git a/news_feed.py b/news_feed.py index 602b5a6..e7c1a4a 100644 --- a/news_feed.py +++ b/news_feed.py @@ -17,7 +17,7 @@ Generic format for a news entry. All keys are considered to be nullable """ from email.utils import parsedate_to_datetime -from site_scraper import SiteScraper +from site_scraper import SiteScraper, download_site_as_html import bemani.sdvx as sound_voltex import bemani.iidx as iidx import sega.chuni_jp as chunithm_jp @@ -28,28 +28,42 @@ import sega.ongeki_jp as ongeki_jp import constants def get_news(news_url: str, version=None) -> list: - scraper = SiteScraper(headless=True) - site_data = scraper.get_page_source(news_url) + # As of right now all supported games don't require JS to pull data from + # scraper = SiteScraper(headless=True) + # site_data = scraper.get_page_source(news_url) if news_url == constants.SOUND_VOLTEX_EXCEED_GEAR_NEWS_SITE: + site_data = download_site_as_html(news_url) news_posts = sorted(sound_voltex.parse_exceed_gear_news_site(site_data, constants.EAMUSEMENT_BASE_URL), key=lambda x: x['timestamp'], reverse=True) + elif news_url == constants.IIDX_PINKY_CRUSH_NEWS_SITE: + site_data = download_site_as_html(news_url) news_posts = sorted(iidx.parse_pinky_crush_news_site(site_data, constants.EAMUSEMENT_BASE_URL), key=lambda x: x['timestamp'], reverse=True) + elif news_url == constants.CHUNITHM_JP_NEWS_SITE: + site_data = download_site_as_html(news_url) if version == constants.CHUNITHM_VERSION.VERSE: news_posts = sorted(chunithm_jp.parse_chuni_jp_verse_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) + elif news_url == constants.CHUNITHM_INTL_NEWS_SITE: + site_data = download_site_as_html(news_url) if version == constants.CHUNITHM_VERSION.LUMINOUS_PLUS: news_posts = sorted(chuni_intl.parse_chuni_intl_luminous_plus_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) + elif news_url == constants.MAIMAIDX_JP_NEWS_SITE: + site_data = download_site_as_html(news_url) if version == 
constants.MAIMAIDX_VERSION.PRISM_PLUS: news_posts = sorted(maimaidx_jp.parse_maimaidx_jp_prism_plus_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) + elif news_url == constants.MAIMAIDX_INTL_NEWS_SITE: + site_data = download_site_as_html(news_url) if version == constants.MAIMAIDX_VERSION.PRISM: news_posts = sorted(maimaidx_intl.parse_maimaidx_intl_prism_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) + elif news_url == constants.ONGEKI_JP_NEWS_SITE: + site_data = download_site_as_html(news_url) if version == constants.ONGEKI_VERSION.REFRESH: news_posts = sorted(ongeki_jp.parse_ongeki_refresh_news_site(site_data), key=lambda x: x['timestamp'], reverse=True) else: news_posts = [] - scraper.close() + # scraper.close() return news_posts diff --git a/site_scraper.py b/site_scraper.py index f801c20..9efa4b6 100644 --- a/site_scraper.py +++ b/site_scraper.py @@ -2,6 +2,7 @@ from selenium.webdriver.chrome.service import Service from selenium.webdriver.chrome.options import Options as ChromeOptions from selenium import webdriver from dotenv import load_dotenv +import requests import time import os @@ -65,3 +66,26 @@ class SiteScraper: self.driver.quit() self.service.stop() print("WebDriver closed successfully") + + +def download_site_as_html(url: str, timeout: int = 10) -> str: + headers = { + "User-Agent": ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/122.0.0.0 Safari/537.36" + ), + "Accept": ( + "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8" + ), + "Accept-Language": "en-US,en;q=0.9", + "Connection": "keep-alive", + } + + try: + response = requests.get(url, headers=headers, timeout=timeout) + response.raise_for_status() + return response.text + except requests.RequestException as e: + print(f"Error downloading {url}: {e}") + return "" |
