init commit

author: Pinapelz <yukais@pinapelz.com> 2025-04-12 00:18:23 -0700
committer: Pinapelz <yukais@pinapelz.com> 2025-04-12 00:18:23 -0700
commit: dc0794ae2c08a35c30ae8de2d44e2a21c7875252 (patch)
tree: 110b4c0f636c099038e5d5e39b27c4d165f9d867 /site_scraper.py
1 files changed, 58 insertions, 0 deletions
diff --git a/site_scraper.py b/site_scraper.py
new file mode 100644
index 0000000..0a49c60
--- /dev/null
+++ b/site_scraper.py
@@ -0,0 +1,58 @@
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.chrome.options import Options as ChromeOptions
+from selenium import webdriver
+from dotenv import load_dotenv
+import time
+import os
+
+load_dotenv()
+
+class SiteScraper:
+    def __init__(self, headless: bool = False, wait_time: float = 5):
+        """
+        Start Chrome via the driver named by the CHROME_DRIVER_PATH environment variable
+        :param headless: Run the browser in headless mode if True
+        :param wait_time: Seconds get_page_source waits after loading (for JavaScript)
+        :raises SystemExit: With status 1 if the WebDriver cannot be started
+        """
+        self.wait_time = wait_time
+        try:
+            self.service = Service(os.environ.get("CHROME_DRIVER_PATH"))
+            self.chrome_options = ChromeOptions()
+            if headless:
+                self.chrome_options.add_argument("--headless")
+                self.chrome_options.add_argument("--disable-gpu")
+                self.chrome_options.add_argument("--disable-dev-shm-usage")
+                self.chrome_options.add_argument("--window-size=1920,1080")
+                self.chrome_options.add_argument("--no-sandbox")
+            self.driver = webdriver.Chrome(service=self.service, options=self.chrome_options)
+        except FileNotFoundError:
+            print("The ChromeDriver executable was not found. Is it installed and accessible in PATH?")
+            # quit() is only injected by the site module and exits 0; fail with status 1
+            raise SystemExit(1)
+        except Exception as e:
+            print(f"An unknown error occurred: {e}")
+            raise SystemExit(1)
+
+    def get_page_source(self, url) -> str:
+        """
+        Load the URL, wait self.wait_time seconds, then return the page source
+        :param url: The URL of the page to scrape
+        :return: The page source, or "" if loading the URL raised an exception
+        """
+        try:
+            self.driver.get(url)
+        except Exception as e:
+            print(f"An error occurred while trying to get the page source: {e}")
+            return ""
+        if self.wait_time > 0:
+            time.sleep(self.wait_time)
+        return self.driver.page_source
+
+    def close(self):
+        """Close the WebDriver, stopping the service even if quitting the browser fails"""
+        try:
+            self.driver.quit()
+        finally:
+            self.service.stop()
+        print("WebDriver closed successfully")
author	Pinapelz <yukais@pinapelz.com>	2025-04-12 00:18:23 -0700
committer	Pinapelz <yukais@pinapelz.com>	2025-04-12 00:18:23 -0700
commit	dc0794ae2c08a35c30ae8de2d44e2a21c7875252 (patch)
tree	110b4c0f636c099038e5d5e39b27c4d165f9d867 /site_scraper.py