# html-scraper/app/scraper/html_scraper.py

from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import WebDriverException
from seleniumbase import Driver
from app.config.settings import Config
import logging
import os
import tempfile
import shutil

# Configure the root logger once at import time: INFO level, with timestamp,
# logger name and severity in front of every message.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)


class HtmlScraper:
    """Scraper that returns the full HTML of a page using SeleniumBase.

    A browser is started as soon as the object is constructed. Release it
    with :meth:`close`, or use the instance as a context manager::

        with HtmlScraper() as scraper:
            html = scraper.get_page_html(url)
    """

    # Free space (in bytes, 200 MiB) below which the temp directory is cleaned.
    MIN_FREE_TEMP_BYTES = 200 * 1024 * 1024
    # Name prefixes of temp-directory entries removed by the clean-up.
    TEMP_PREFIXES = ('tmp', 'selenium')
    # Value passed to ``driver.set_script_timeout``.
    SCRIPT_TIMEOUT_SECONDS = 30

    def __init__(self, headless=None):
        """Read the settings and start the browser.

        Args:
            headless: Passed to the SeleniumBase ``Driver`` as ``headless``.
                When ``None``, ``Config.HEADLESS_BROWSER`` is used instead.

        Raises:
            Exception: If the browser cannot be initialized.
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.driver = None
        self.wait = None
        self.headless = headless if headless is not None else Config.HEADLESS_BROWSER
        self.wait_timeout = Config.DEFAULT_WAIT_TIMEOUT
        self._setup_browser()

    def __enter__(self):
        """Return the scraper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the browser on leaving a ``with`` block; never suppresses errors."""
        self.close()

    def _clean_temp_files(self):
        """Best-effort removal of temp entries when free disk space is low.

        Only entries of the system temp directory whose names start with one
        of ``TEMP_PREFIXES`` are removed. Any filesystem error is logged as a
        warning and never propagated, so a failed clean-up cannot prevent the
        browser from starting.
        """
        temp_dir = tempfile.gettempdir()
        try:
            if shutil.disk_usage(temp_dir).free >= self.MIN_FREE_TEMP_BYTES:
                return
        except OSError as e:
            self.logger.warning(f"Could not check free disk space in {temp_dir}: {e}")
            return

        self.logger.info("Low disk space detected, cleaning temporary files")
        try:
            items = os.listdir(temp_dir)
        except OSError as e:
            self.logger.warning(f"Could not list temp directory {temp_dir}: {e}")
            return

        for item in items:
            if not item.startswith(self.TEMP_PREFIXES):
                continue
            path = os.path.join(temp_dir, item)
            try:
                if os.path.isdir(path):
                    shutil.rmtree(path, ignore_errors=True)
                else:
                    os.remove(path)
            except OSError as e:
                self.logger.warning(f"Failed to clean temp file {item}: {e}")

    def _setup_browser(self):
        """Create the driver and its ``WebDriverWait``.

        Raises:
            Exception: If a ``WebDriverException`` occurs during start-up.
                The original error is attached as ``__cause__``.
        """
        # The directory is only created here; nothing else in this class
        # refers to it.
        cache_dir = os.path.join(os.path.expanduser("~"), ".browser_cache")
        os.makedirs(cache_dir, exist_ok=True)

        self._clean_temp_files()

        try:
            self.driver = Driver(uc=True, headless=self.headless)
            self.wait = WebDriverWait(self.driver, self.wait_timeout)
            self.driver.set_script_timeout(self.SCRIPT_TIMEOUT_SECONDS)
            self.logger.info("Browser initialized successfully")
        except WebDriverException as e:
            self.logger.error(f"Failed to initialize browser: {str(e)}")
            # Release a driver that was created before a later step failed,
            # instead of waiting for garbage collection to do it.
            self.close()
            raise Exception(f"Browser initialization failed: {str(e)}") from e

    def get_page_html(self, url):
        """
        Navigate to a URL and return the full page HTML.

        After navigation the method sleeps for ``Config.PAGE_LOAD_WAIT``
        seconds before reading the page source.

        Args:
            url: Target URL to scrape.

        Returns:
            The page HTML as a string.

        Raises:
            RuntimeError: If the browser has already been closed.
        """
        import time

        if self.driver is None:
            raise RuntimeError("Browser is not available; the scraper has been closed")

        self.logger.info(f"Navigating to: {url}")
        self.driver.get(url)

        time.sleep(Config.PAGE_LOAD_WAIT)

        html = self.driver.page_source
        self.logger.info(f"Retrieved HTML ({len(html)} chars) from {url}")
        return html

    def close(self):
        """Quit the browser if one is open; safe to call more than once.

        Errors raised while quitting are logged, not propagated.
        """
        driver = getattr(self, "driver", None)
        if not driver:
            return
        # Drop the references before quitting so that a repeated close()
        # (for example from __del__) never quits the same driver twice.
        self.driver = None
        self.wait = None
        try:
            driver.quit()
            self.logger.info("Browser closed")
        except Exception as e:
            self.logger.error(f"Error closing browser: {str(e)}")

    def __del__(self):
        """Last-resort clean-up when the object is garbage collected."""
        try:
            self.close()
        except Exception:
            # A finalizer may run on a partially constructed object or during
            # interpreter shutdown; there is nothing useful left to do here.
            pass