85 lines
2.9 KiB
Python
85 lines
2.9 KiB
Python
from selenium.webdriver.support.ui import WebDriverWait
|
|
from selenium.common.exceptions import WebDriverException
|
|
from seleniumbase import Driver
|
|
from app.config.settings import Config
|
|
import logging
|
|
import os
|
|
import tempfile
|
|
import shutil
|
|
|
|
# Configure the root logger when this module is imported: INFO level, with
# timestamp, logger name and level prefixed to every message.  Note that
# ``basicConfig`` does nothing if the root logger already has handlers.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
|
|
|
|
|
|
class HtmlScraper:
    """Scraper that returns the full HTML of a page using SeleniumBase.

    A browser is started in the constructor and kept for the lifetime of the
    instance.  Release it with :meth:`close`, or use the instance as a context
    manager so the browser is closed automatically::

        with HtmlScraper() as scraper:
            html = scraper.get_page_html(url)
    """

    # Temp-file cleanup is attempted when less than this many bytes are free.
    _MIN_FREE_BYTES = 200 * 1024 * 1024
    # Name prefixes of temp-directory entries removed by the cleanup.
    _TEMP_PREFIXES = ('tmp', 'selenium')
    # Value passed to ``driver.set_script_timeout``.
    _SCRIPT_TIMEOUT_SECONDS = 30

    def __init__(self, headless=None):
        """Create the scraper and start the browser.

        Args:
            headless: Whether to run the browser headless.  When ``None``,
                ``Config.HEADLESS_BROWSER`` is used.

        Raises:
            Exception: If the browser cannot be initialized.
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.driver = None
        self.wait = None
        self.headless = headless if headless is not None else Config.HEADLESS_BROWSER
        self.wait_timeout = Config.DEFAULT_WAIT_TIMEOUT
        self._setup_browser()

    def __enter__(self):
        """Return the scraper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the browser on leaving the ``with`` block; never suppresses errors."""
        self.close()

    def _free_temp_space(self):
        """Best-effort removal of stale temp entries when disk space is low.

        Entries in the system temp directory whose names start with ``tmp`` or
        ``selenium`` are deleted when free space drops below
        ``_MIN_FREE_BYTES``.  Every failure is logged and otherwise ignored so
        that housekeeping can never prevent the browser from starting.
        """
        temp_dir = tempfile.gettempdir()
        try:
            free_bytes = shutil.disk_usage(temp_dir).free
        except OSError as e:
            self.logger.warning(f"Could not check free space in {temp_dir}: {e}")
            return
        if free_bytes >= self._MIN_FREE_BYTES:
            return

        self.logger.info("Low disk space detected, cleaning temporary files")
        try:
            entries = os.listdir(temp_dir)
        except OSError as e:
            self.logger.warning(f"Could not list temp directory {temp_dir}: {e}")
            return

        for item in entries:
            if not item.startswith(self._TEMP_PREFIXES):
                continue
            path = os.path.join(temp_dir, item)
            try:
                if os.path.isdir(path):
                    shutil.rmtree(path, ignore_errors=True)
                else:
                    os.remove(path)
            except OSError as e:
                self.logger.warning(f"Failed to clean temp file {item}: {e}")

    def _setup_browser(self):
        """Start the browser and attach the explicit-wait helper.

        Raises:
            Exception: Wrapping the ``WebDriverException`` raised while the
                browser was being created or configured.
        """
        try:
            # The directory is only created here; nothing in this class reads it.
            cache_dir = os.path.join(os.path.expanduser("~"), ".browser_cache")
            os.makedirs(cache_dir, exist_ok=True)

            self._free_temp_space()

            self.driver = Driver(uc=True, headless=self.headless)
            self.wait = WebDriverWait(self.driver, self.wait_timeout)
            self.driver.set_script_timeout(self._SCRIPT_TIMEOUT_SECONDS)
            self.logger.info("Browser initialized successfully")

        except WebDriverException as e:
            self.logger.error(f"Failed to initialize browser: {str(e)}")
            # The driver may already exist if a later configuration step
            # failed; quit it so no browser process is left behind.
            self.close()
            raise Exception(f"Browser initialization failed: {str(e)}") from e

    def get_page_html(self, url):
        """
        Navigate to a URL and return the full page HTML.

        After navigation the call sleeps for ``Config.PAGE_LOAD_WAIT`` seconds
        before reading the page source.

        Args:
            url: Target URL to scrape.

        Returns:
            The page HTML as a string.
        """
        # Function-scope import: ``time`` is not imported at module level.
        import time

        self.logger.info(f"Navigating to: {url}")
        self.driver.get(url)

        time.sleep(Config.PAGE_LOAD_WAIT)

        html = self.driver.page_source
        self.logger.info(f"Retrieved HTML ({len(html)} chars) from {url}")
        return html

    def close(self):
        """Quit the browser if one is open.

        Safe to call any number of times: the driver reference is dropped
        before ``quit()`` is attempted, so a later call (including the one
        made by ``__del__``) is a no-op.  Errors from ``quit()`` are logged,
        not raised.
        """
        # getattr keeps this safe on an instance whose __init__ never ran far
        # enough to set the attribute.
        driver = getattr(self, "driver", None)
        if not driver:
            return

        # Drop the references first so a failing quit() is never retried.
        self.driver = None
        self.wait = None
        try:
            driver.quit()
            self.logger.info("Browser closed")
        except Exception as e:
            self.logger.error(f"Error closing browser: {str(e)}")

    def __del__(self):
        """Last-resort cleanup when the instance is garbage-collected."""
        self.close()
|