diff options
| author | Pinapelz <yukais@pinapelz.com> | 2025-04-15 01:37:45 -0700 |
|---|---|---|
| committer | Pinapelz <yukais@pinapelz.com> | 2025-04-15 01:37:45 -0700 |
| commit | db96364d2301c79a05998be0fcf27e6013517b22 (patch) | |
| tree | 433142407b3bba552c510ec4431b1a7ee5684c7b /site_scraper.py | |
| parent | c9d2521b7beb5b5d1077565ec968cb3421497417 (diff) | |
optimization: pull site using requests for those that don't need JS
Diffstat (limited to 'site_scraper.py')
| -rw-r--r-- | site_scraper.py | 24 |
1 files changed, 24 insertions, 0 deletions
```diff
diff --git a/site_scraper.py b/site_scraper.py
index f801c20..9efa4b6 100644
--- a/site_scraper.py
+++ b/site_scraper.py
@@ -2,6 +2,7 @@ from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.chrome.options import Options as ChromeOptions
 from selenium import webdriver
 from dotenv import load_dotenv
+import requests
 import time
 import os
 
@@ -65,3 +66,26 @@ class SiteScraper:
         self.driver.quit()
         self.service.stop()
         print("WebDriver closed successfully")
+
+
+def download_site_as_html(url: str, timeout: int = 10) -> str:
+    headers = {
+        "User-Agent": (
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+            "AppleWebKit/537.36 (KHTML, like Gecko) "
+            "Chrome/122.0.0.0 Safari/537.36"
+        ),
+        "Accept": (
+            "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"
+        ),
+        "Accept-Language": "en-US,en;q=0.9",
+        "Connection": "keep-alive",
+    }
+
+    try:
+        response = requests.get(url, headers=headers, timeout=timeout)
+        response.raise_for_status()
+        return response.text
+    except requests.RequestException as e:
+        print(f"Error downloading {url}: {e}")
+        return ""
```
