diff options
Diffstat (limited to 'site_scraper.py')
| -rw-r--r-- | site_scraper.py | 24 |
1 files changed, 24 insertions, 0 deletions
def download_site_as_html(url: str, timeout: int = 10) -> str:
    """Download *url* with a single HTTP GET and return the body as text.

    The request is sent with browser-like headers (a desktop Chrome
    User-Agent plus typical Accept / Accept-Language values).

    This is a best-effort helper: any ``requests.RequestException`` --
    including the ``HTTPError`` raised by ``raise_for_status()`` for 4xx/5xx
    responses -- is reported on stdout and turned into an empty string
    instead of propagating to the caller.

    Args:
        url: Address of the page to fetch.
        timeout: Value passed to ``requests.get(timeout=...)``, in seconds.

    Returns:
        The decoded response body, or ``""`` if the download failed.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/122.0.0.0 Safari/537.36"
        ),
        "Accept": (
            "text/html,application/xhtml+xml,application/xml;q=0.9,"
            "image/avif,image/webp,*/*;q=0.8"
        ),
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive",
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        # When the server sends a text/* Content-Type without a charset,
        # Requests falls back to ISO-8859-1, which garbles pages that are
        # actually UTF-8 (charset declared only inside the HTML). In that
        # case let Requests detect the encoding from the body instead.
        content_type = response.headers.get("Content-Type", "")
        if "charset" not in content_type.lower():
            response.encoding = response.apparent_encoding

        return response.text
    except requests.RequestException as e:
        print(f"Error downloading {url}: {e}")
        return ""
