feat: initialize HTML scraper API with Flask and SeleniumBase
This commit is contained in:
15
app/__init__.py
Normal file
15
app/__init__.py
Normal file
@@ -0,0 +1,15 @@
|
||||
from flask import Flask
|
||||
from app.config.settings import Config
|
||||
|
||||
|
||||
def create_app(config_class=Config):
    """Application factory: build and configure the Flask app.

    Args:
        config_class: Configuration class loaded into ``app.config``;
            defaults to the environment-selected ``Config``.

    Returns:
        The configured Flask application with all blueprints registered.
    """
    application = Flask(__name__)
    application.config.from_object(config_class)

    # Imported inside the factory (not at module top) to avoid circular
    # imports between the app package and the route modules.
    from app.routes.health_routes import health_bp
    from app.routes.scrape_routes import scrape_bp

    for blueprint in (health_bp, scrape_bp):
        application.register_blueprint(blueprint)

    return application
|
||||
0
app/config/__init__.py
Normal file
0
app/config/__init__.py
Normal file
28
app/config/settings.py
Normal file
28
app/config/settings.py
Normal file
@@ -0,0 +1,28 @@
|
||||
import os
|
||||
|
||||
|
||||
class BaseConfig:
    """Shared configuration defaults; each value may be overridden via environment."""

    # NOTE(review): the fallback key is fine for development, but production
    # deployments should always provide SECRET_KEY via the environment.
    SECRET_KEY = os.environ.get('SECRET_KEY') or 'dev-key-for-development-only'
    DEBUG = False
    TESTING = False

    # Scraper tuning knobs (seconds for the two timeouts).
    HEADLESS_BROWSER = os.environ.get('HEADLESS_BROWSER', 'True').lower() == 'true'
    # `or` (rather than a get() default) also covers empty-string env values.
    DEFAULT_WAIT_TIMEOUT = int(os.environ.get('DEFAULT_WAIT_TIMEOUT') or 15)
    PAGE_LOAD_WAIT = int(os.environ.get('PAGE_LOAD_WAIT') or 3)


class DevelopmentConfig(BaseConfig):
    """Development settings: debug mode enabled."""

    DEBUG = True


class ProductionConfig(BaseConfig):
    """Production settings: inherits the base defaults unchanged."""

    pass


# Registry mapping FLASK_ENV values to configuration classes.
config_by_name = {
    'development': DevelopmentConfig,
    'production': ProductionConfig,
    'default': DevelopmentConfig,
}

# Fall back to the 'default' entry for unrecognized FLASK_ENV values instead
# of raising KeyError at import time (the original indexed the dict directly,
# so any FLASK_ENV other than the two known names crashed the import).
Config = config_by_name.get(
    os.environ.get('FLASK_ENV', 'development'),
    config_by_name['default'],
)
|
||||
0
app/routes/__init__.py
Normal file
0
app/routes/__init__.py
Normal file
12
app/routes/health_routes.py
Normal file
12
app/routes/health_routes.py
Normal file
@@ -0,0 +1,12 @@
|
||||
from flask import Blueprint, jsonify
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)

health_bp = Blueprint('health', __name__, url_prefix='/api')


@health_bp.route('/health', methods=['GET'])
def health_check():
    """Report service liveness with a constant JSON payload."""
    logger.info("Health check requested")
    payload = {"status": "ok"}
    return jsonify(payload)
|
||||
40
app/routes/scrape_routes.py
Normal file
40
app/routes/scrape_routes.py
Normal file
@@ -0,0 +1,40 @@
|
||||
from flask import Blueprint, request, jsonify
|
||||
from app.scraper.html_scraper import HtmlScraper
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)

scrape_bp = Blueprint('scrape', __name__, url_prefix='/api')


@scrape_bp.route('/scrape', methods=['POST'])
def scrape_html():
    """
    Return the HTML content of a given page.

    Expected JSON payload:
        {
            "url": "https://example.com"
        }

    Returns:
        200 with {"success": True, "html": ...} on success,
        400 for a missing/invalid payload, 500 on scraper failure.
    """
    try:
        # silent=True yields None for a missing or malformed JSON body, so bad
        # payloads get a 400 here instead of raising and bubbling up as a 500.
        data = request.get_json(silent=True)

        if not data or 'url' not in data:
            logger.warning("Missing url in request")
            return jsonify({"success": False, "error": "Missing url in request"}), 400

        target_url = data['url']
        # Reject non-string / blank URLs before paying the cost of a browser.
        if not isinstance(target_url, str) or not target_url.strip():
            logger.warning("Invalid url in request")
            return jsonify({"success": False, "error": "Invalid url in request"}), 400

        logger.info(f"Scraping HTML for URL: {target_url}")

        # NOTE(review): a full browser is launched per request; consider a
        # pooled/shared scraper if throughput becomes a concern.
        scraper = HtmlScraper()

        try:
            html = scraper.get_page_html(target_url)
            return jsonify({"success": True, "html": html})
        finally:
            # Always release the browser, even when scraping fails.
            scraper.close()

    except Exception as e:
        # Boundary handler: log with traceback and return a JSON error.
        logger.exception(f"Error scraping page: {str(e)}")
        return jsonify({"success": False, "error": str(e)}), 500
|
||||
0
app/scraper/__init__.py
Normal file
0
app/scraper/__init__.py
Normal file
84
app/scraper/html_scraper.py
Normal file
84
app/scraper/html_scraper.py
Normal file
@@ -0,0 +1,84 @@
|
||||
import logging
import os
import shutil
import tempfile
import time

from selenium.common.exceptions import WebDriverException
from selenium.webdriver.support.ui import WebDriverWait
from seleniumbase import Driver

from app.config.settings import Config
|
||||
|
||||
# NOTE(review): calling basicConfig at import time configures the root logger
# as a module-level side effect; library modules usually leave this to the
# application entry point — confirm this placement is intended.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
|
||||
|
||||
|
||||
class HtmlScraper:
    """Scraper that returns the full HTML of a page using SeleniumBase."""

    def __init__(self, headless=None):
        """
        Start a browser immediately.

        Args:
            headless: Run the browser headless; defaults to
                Config.HEADLESS_BROWSER when None.

        Raises:
            Exception: if the underlying WebDriver fails to start.
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.driver = None
        self.wait = None
        self.headless = headless if headless is not None else Config.HEADLESS_BROWSER
        self.wait_timeout = Config.DEFAULT_WAIT_TIMEOUT
        self._setup_browser()

    def _setup_browser(self):
        """Launch the browser, freeing temp-dir space first if it is low."""
        try:
            # NOTE(review): this directory is created but never referenced
            # again in this class — presumably intended as a browser profile
            # cache; confirm it is still needed.
            cache_dir = os.path.join(os.path.expanduser("~"), ".browser_cache")
            os.makedirs(cache_dir, exist_ok=True)

            self._clean_temp_if_low_disk()

            # uc=True enables SeleniumBase's undetected-chromedriver mode.
            self.driver = Driver(uc=True, headless=self.headless)
            self.wait = WebDriverWait(self.driver, self.wait_timeout)
            self.driver.set_script_timeout(30)
            self.logger.info("Browser initialized successfully")

        except WebDriverException as e:
            self.logger.error(f"Failed to initialize browser: {str(e)}")
            raise Exception(f"Browser initialization failed: {str(e)}")

    def _clean_temp_if_low_disk(self):
        """Best-effort removal of tmp/selenium leftovers when free space < 200 MB."""
        temp_dir = tempfile.gettempdir()
        if shutil.disk_usage(temp_dir).free >= 1024 * 1024 * 200:
            return

        self.logger.info("Low disk space detected, cleaning temporary files")
        for item in os.listdir(temp_dir):
            # Only touch entries that look like selenium/tempfile leftovers;
            # NOTE(review): live temp files of other processes matching these
            # prefixes could still be removed — this is deliberately best-effort.
            if item.startswith('tmp') or item.startswith('selenium'):
                try:
                    path = os.path.join(temp_dir, item)
                    if os.path.isdir(path):
                        shutil.rmtree(path, ignore_errors=True)
                    else:
                        os.remove(path)
                except Exception as e:
                    self.logger.warning(f"Failed to clean temp file {item}: {e}")

    def get_page_html(self, url):
        """
        Navigate to a URL and return the full page HTML.

        Args:
            url: Target URL to scrape.

        Returns:
            The page HTML as a string.
        """
        self.logger.info(f"Navigating to: {url}")
        self.driver.get(url)

        # Fixed settle delay for dynamic content; an explicit WebDriverWait
        # condition would be more robust for specific pages.
        time.sleep(Config.PAGE_LOAD_WAIT)

        html = self.driver.page_source
        self.logger.info(f"Retrieved HTML ({len(html)} chars) from {url}")
        return html

    def close(self):
        """Shut down the browser. Safe to call more than once."""
        if self.driver:
            try:
                self.driver.quit()
                self.logger.info("Browser closed")
            except Exception as e:
                self.logger.error(f"Error closing browser: {str(e)}")
            finally:
                # Clear the handle so repeated close()/__del__ calls are
                # no-ops instead of quitting an already-dead driver.
                self.driver = None
                self.wait = None

    def __del__(self):
        # getattr guards against partially-initialized instances whose
        # __init__ raised before self.driver was assigned.
        if getattr(self, 'driver', None):
            self.close()
|
||||
Reference in New Issue
Block a user