aboutsummaryrefslogtreecommitdiffstats
path: root/site_scraper.py
diff options
context:
space:
mode:
authorPinapelz <yukais@pinapelz.com>2025-04-15 01:37:45 -0700
committerPinapelz <yukais@pinapelz.com>2025-04-15 01:37:45 -0700
commitdb96364d2301c79a05998be0fcf27e6013517b22 (patch)
tree433142407b3bba552c510ec4431b1a7ee5684c7b /site_scraper.py
parentc9d2521b7beb5b5d1077565ec968cb3421497417 (diff)
optimization: pull site using requests for those that don't need JS
Diffstat (limited to 'site_scraper.py')
-rw-r--r--site_scraper.py24
1 files changed, 24 insertions, 0 deletions
diff --git a/site_scraper.py b/site_scraper.py
index f801c20..9efa4b6 100644
--- a/site_scraper.py
+++ b/site_scraper.py
@@ -2,6 +2,7 @@ from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium import webdriver
from dotenv import load_dotenv
+import requests
import time
import os
@@ -65,3 +66,26 @@ class SiteScraper:
self.driver.quit()
self.service.stop()
print("WebDriver closed successfully")
+
+
+def download_site_as_html(url: str, timeout: int = 10) -> str:
+ headers = {
+ "User-Agent": (
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
+ "Chrome/122.0.0.0 Safari/537.36"
+ ),
+ "Accept": (
+ "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"
+ ),
+ "Accept-Language": "en-US,en;q=0.9",
+ "Connection": "keep-alive",
+ }
+
+ try:
+ response = requests.get(url, headers=headers, timeout=timeout)
+ response.raise_for_status()
+ return response.text
+ except requests.RequestException as e:
+ print(f"Error downloading {url}: {e}")
+ return ""
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage