diff options
| author | Pinapelz <yukais@pinapelz.com> | 2025-04-12 00:18:23 -0700 |
|---|---|---|
| committer | Pinapelz <yukais@pinapelz.com> | 2025-04-12 00:18:23 -0700 |
| commit | dc0794ae2c08a35c30ae8de2d44e2a21c7875252 (patch) | |
| tree | 110b4c0f636c099038e5d5e39b27c4d165f9d867 /site_scraper.py | |
init commit
Diffstat (limited to 'site_scraper.py')
| -rw-r--r-- | site_scraper.py | 58 |
1 files changed, 58 insertions, 0 deletions
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium import webdriver
from dotenv import load_dotenv
import time
import os

# Pull settings such as CHROME_DRIVER_PATH from a local .env file into os.environ.
load_dotenv()


class SiteScraper:
    """
    Thin wrapper around a Selenium Chrome WebDriver for fetching rendered page source.

    The ChromeDriver location is read from the ``CHROME_DRIVER_PATH`` environment
    variable. Instances can be used as context managers so the browser is always
    shut down::

        with SiteScraper(headless=True) as scraper:
            html = scraper.get_page_source("https://example.com")
    """

    def __init__(self, headless: bool = False, wait_time: float = 5):
        """
        Start a Chrome WebDriver session.

        :param headless: Run the browser in headless mode if True
        :param wait_time: Seconds to sleep after each page load so JavaScript can
            finish rendering; values <= 0 disable the wait
        :raises SystemExit: with exit status 1 if the driver cannot be started
            (an explanatory message is printed first)
        """
        self.wait_time = wait_time
        try:
            # os.environ.get returns None when the variable is unset; the value is
            # handed to Service as-is.
            self.service = Service(os.environ.get("CHROME_DRIVER_PATH"))
            self.chrome_options = ChromeOptions()

            # NOTE(review): the original indentation was lost in extraction; all
            # five flags are assumed to apply only in headless mode -- confirm.
            if headless:
                self.chrome_options.add_argument("--headless")
                self.chrome_options.add_argument("--disable-gpu")
                self.chrome_options.add_argument("--disable-dev-shm-usage")
                self.chrome_options.add_argument("--window-size=1920,1080")
                self.chrome_options.add_argument("--no-sandbox")

            self.driver = webdriver.Chrome(service=self.service, options=self.chrome_options)
        except FileNotFoundError as exc:
            print("The ChromeDriver executable was not found. Is it installed and accessible in PATH?")
            # quit() is an interactive-shell helper injected by the site module and
            # exits with status 0; raise SystemExit directly with a failure status.
            raise SystemExit(1) from exc
        except Exception as exc:
            print(f"An unknown error occurred: {exc}")
            raise SystemExit(1) from exc

    def __enter__(self) -> "SiteScraper":
        """Return the scraper itself so it can be bound by ``with ... as``."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the WebDriver on leaving the ``with`` block; exceptions propagate."""
        self.close()

    def get_page_source(self, url: str) -> str:
        """
        Get the page source of the given URL

        After navigation, sleeps for ``self.wait_time`` seconds (when positive)
        before reading the source, giving JavaScript time to run.

        :param url: The URL of the page to scrape
        :return: The page source, or an empty string if navigation failed
        """
        try:
            self.driver.get(url)
        except Exception as exc:
            print(f"An error occurred while trying to get the page source: {exc}")
            return ""
        if self.wait_time > 0:
            time.sleep(self.wait_time)
        return self.driver.page_source

    def close(self) -> None:
        """
        Close the WebDriver

        The driver service is stopped even if quitting the browser raises, so the
        chromedriver process is not left running; the error still propagates.
        """
        try:
            self.driver.quit()
        finally:
            self.service.stop()
        print("WebDriver closed successfully")
