import logging
import os
import shutil
import tempfile
import time

from selenium.common.exceptions import WebDriverException
from selenium.webdriver.support.ui import WebDriverWait
from seleniumbase import Driver

from app.config.settings import Config

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)


class HtmlScraper:
    """Scraper that returns the full HTML of a page using SeleniumBase.

    The browser is started in ``__init__`` and must be released with
    ``close()``. The class can also be used as a context manager::

        with HtmlScraper() as scraper:
            html = scraper.get_page_html(url)
    """

    # Free-space threshold (bytes) below which temp files are purged
    # before the browser is launched.
    _MIN_FREE_BYTES = 1024 * 1024 * 200

    # Name prefixes of temp-dir entries removed when disk space is low.
    _TEMP_PREFIXES = ('tmp', 'selenium')

    def __init__(self, headless=None):
        """Create the scraper and start the browser.

        Args:
            headless: Whether to run the browser headless. When ``None``,
                ``Config.HEADLESS_BROWSER`` is used.

        Raises:
            Exception: If the browser cannot be initialized.
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.driver = None
        self.wait = None
        self.headless = headless if headless is not None else Config.HEADLESS_BROWSER
        self.wait_timeout = Config.DEFAULT_WAIT_TIMEOUT
        self._setup_browser()

    def __enter__(self):
        """Return the scraper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the browser on leaving the ``with`` block; never suppress errors."""
        self.close()
        return False

    def _clean_temp_files(self, temp_dir):
        """Best-effort removal of entries in *temp_dir* matching ``_TEMP_PREFIXES``.

        Failures on individual entries are logged as warnings and skipped.
        """
        for item in os.listdir(temp_dir):
            if not item.startswith(self._TEMP_PREFIXES):
                continue
            try:
                path = os.path.join(temp_dir, item)
                if os.path.isdir(path):
                    shutil.rmtree(path, ignore_errors=True)
                else:
                    os.remove(path)
            except Exception as e:
                self.logger.warning(f"Failed to clean temp file {item}: {e}")

    def _setup_browser(self):
        """Prepare the environment and launch the browser.

        Ensures ``~/.browser_cache`` exists, purges temp files when free disk
        space in the temp directory is below ``_MIN_FREE_BYTES``, then creates
        the driver and its ``WebDriverWait``.

        Raises:
            Exception: Wrapping the ``WebDriverException`` (chained as
                ``__cause__``) when the browser fails to start.
        """
        try:
            cache_dir = os.path.join(os.path.expanduser("~"), ".browser_cache")
            os.makedirs(cache_dir, exist_ok=True)

            temp_dir = tempfile.gettempdir()
            if shutil.disk_usage(temp_dir).free < self._MIN_FREE_BYTES:
                self.logger.info("Low disk space detected, cleaning temporary files")
                self._clean_temp_files(temp_dir)

            self.driver = Driver(uc=True, headless=self.headless)
            self.wait = WebDriverWait(self.driver, self.wait_timeout)
            self.driver.set_script_timeout(30)
            self.logger.info("Browser initialized successfully")
        except WebDriverException as e:
            self.logger.error(f"Failed to initialize browser: {e}")
            # The driver may have started before a later step failed; release
            # it so a half-initialized browser process is not leaked.
            self.close()
            raise Exception(f"Browser initialization failed: {e}") from e

    def get_page_html(self, url):
        """Navigate to a URL and return the full page HTML.

        After navigation the method sleeps for ``Config.PAGE_LOAD_WAIT``
        seconds before reading the page source.

        Args:
            url: Target URL to scrape.

        Returns:
            The page HTML as a string.

        Raises:
            RuntimeError: If the browser has already been closed.
        """
        if self.driver is None:
            raise RuntimeError("Browser is not initialized or has been closed")

        self.logger.info(f"Navigating to: {url}")
        self.driver.get(url)
        time.sleep(Config.PAGE_LOAD_WAIT)
        html = self.driver.page_source
        self.logger.info(f"Retrieved HTML ({len(html)} chars) from {url}")
        return html

    def close(self):
        """Quit the browser if it is running.

        Safe to call more than once: the driver reference is cleared after
        the first call, so later calls (including the one from ``__del__``)
        are no-ops. Errors raised while quitting are logged, not propagated.
        """
        # getattr guards against __init__ having failed before the
        # attribute was assigned.
        driver = getattr(self, "driver", None)
        if driver is None:
            return
        try:
            driver.quit()
            self.logger.info("Browser closed")
        except Exception as e:
            self.logger.error(f"Error closing browser: {e}")
        finally:
            self.driver = None
            self.wait = None

    def __del__(self):
        """Release the browser when the scraper is garbage-collected."""
        self.close()