feat: initialize HTML scraper API with Flask and SeleniumBase

2026-02-13 16:03:35 +01:00
commit 9659382d62
16 changed files with 1707 additions and 0 deletions
--- a/.env.example
+++ b/.env.example
@@ -0,0 +1,11 @@
 FLASK_ENV=development
 FLASK_DEBUG=1
 # Selenium
 HEADLESS_BROWSER=True
 DEFAULT_WAIT_TIMEOUT=15
 PAGE_LOAD_WAIT=3
 # Server
 PORT=4001
 SECRET_KEY=change-me-in-production
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,33 @@
 venv/
 env/
 .venv/
 ENV/
 .env
 __pycache__/
 *.py[cod]
 *$py.class
 *.so
 .Python
 dist/
 build/
 *.egg-info/
 *.log
 logs/
 .coverage
 htmlcov/
 .pytest_cache/
 .idea/
 .vscode/
 *.swp
 *.swo
 .DS_Store
 downloaded_files/
 # uv
 .python-version
--- a/Dockerfile
+++ b/Dockerfile
@@ -0,0 +1,48 @@
 # Image for the HTML scraper API: Python 3.12 slim base, dependencies managed by uv,
 # served by gunicorn on port 4001.
 FROM python:3.12-slim
 # Copy the uv and uvx binaries out of the astral-sh/uv image into /bin.
 # NOTE(review): the ":latest" tag is unpinned, so rebuilds may pick up a different uv.
 COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
 # Install system dependencies for Chrome / SeleniumBase.
 # NOTE(review): only supporting libraries and tools are installed here; no step in
 # this file installs a Chrome binary, and wget/gnupg2/unzip/curl are not used by any
 # later instruction -- confirm how the browser itself is provided at runtime.
 RUN apt-get update && apt-get install -y --no-install-recommends \
    wget \
    gnupg2 \
    unzip \
    curl \
    fonts-liberation \
    libasound2 \
    libatk-bridge2.0-0 \
    libatk1.0-0 \
    libcups2 \
    libdbus-1-3 \
    libdrm2 \
    libgbm1 \
    libgtk-3-0 \
    libnspr4 \
    libnss3 \
    libxcomposite1 \
    libxdamage1 \
    libxrandr2 \
    xdg-utils \
    && rm -rf /var/lib/apt/lists/*
 WORKDIR /app
 # Presumably tells uv to skip the "dev" dependency group declared in pyproject.toml
 # -- TODO confirm against the uv documentation.
 ENV UV_NO_DEV=1
 # Install dependencies (cached layer): only uv.lock and pyproject.toml are bind-mounted,
 # so this layer does not depend on the application source copied below.
 RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=uv.lock,target=uv.lock \
    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
    uv sync --locked --no-install-project
 # Copy the application source, then sync again now that the project itself is present.
 COPY . /app
 RUN --mount=type=cache,target=/root/.cache/uv \
    uv sync --locked
 # Defaults for the settings read by app/config/settings.py and run.py.
 ENV HEADLESS_BROWSER=True
 ENV PORT=4001
 EXPOSE 4001
 # Serve the "app" object from run.py with gunicorn.
 # NOTE(review): the bind address hard-codes port 4001; the PORT variable above is not
 # referenced by this command, so overriding PORT does not change the listening port.
 CMD ["uv", "run", "gunicorn", "--bind", "0.0.0.0:4001", "--timeout", "120", "--workers", "2", "run:app"]
--- a/README.md
+++ b/README.md
@@ -0,0 +1,68 @@
 # HTML Scraper
 A simple Python API that exposes a single route to return the HTML content of any page, using Flask and SeleniumBase.
 ## Stack
 - **Python 3.12** with **uv** for dependency management
 - **Flask** as web framework
 - **SeleniumBase** (undetected Chrome) for page rendering
 - **Gunicorn** as production WSGI server
 - **Docker** for containerization
 ## Setup
 ### Local development
 ```bash
 # Install dependencies
 uv sync
 # Copy and edit environment variables
 cp .env.example .env
 # Run the server
 uv run python run.py
 ```
 ### Docker
 ```bash
 # Build
 docker build -t html-scraper .
 # Run
 docker run -p 4001:4001 --env-file .env html-scraper
 ```
 ## API
 ### Health check
 ```
 GET /api/health
 ```
 Response:
 ```json
 {"status": "ok"}
 ```
 ### Scrape HTML
 ```
 POST /api/scrape
 Content-Type: application/json
 {
  "url": "https://example.com"
 }
 ```
 Response:
 ```json
 {
  "success": true,
  "html": "<!DOCTYPE html>..."
 }
 ```
--- a/app/__init__.py
+++ b/app/__init__.py
@@ -0,0 +1,15 @@
 from flask import Flask
 from app.config.settings import Config
 def create_app(config_class=Config):
    """Application factory.

    Builds a Flask instance, loads its settings from ``config_class`` and
    registers the health and scrape blueprints.

    Args:
        config_class: Object whose attributes are loaded into the Flask
            configuration. Defaults to the module-level ``Config``.

    Returns:
        The configured Flask application.
    """
    flask_app = Flask(__name__)
    flask_app.config.from_object(config_class)

    # The route modules are imported inside the factory, after the
    # application object has been configured.
    from app.routes.health_routes import health_bp
    from app.routes.scrape_routes import scrape_bp

    for blueprint in (health_bp, scrape_bp):
        flask_app.register_blueprint(blueprint)

    return flask_app
--- a/app/config/__init__.py
+++ b/app/config/__init__.py
--- a/app/config/settings.py
+++ b/app/config/settings.py
@@ -0,0 +1,28 @@
 import os
 class BaseConfig:
    """Settings shared by every environment.

    All values are computed once, when this class body is executed, from the
    process environment as it is at that moment.
    """

    # Flask flags; environment-specific subclasses override these as needed.
    DEBUG = False
    TESTING = False

    # Falls back to a development-only key when SECRET_KEY is unset or empty.
    SECRET_KEY = os.getenv('SECRET_KEY') or 'dev-key-for-development-only'

    # True only when the variable spells "true" (any letter case); defaults to True.
    HEADLESS_BROWSER = os.getenv('HEADLESS_BROWSER', 'True').lower() == 'true'

    # Integer settings; an unset or empty variable falls back to the literal default.
    DEFAULT_WAIT_TIMEOUT = int(os.getenv('DEFAULT_WAIT_TIMEOUT') or 15)
    PAGE_LOAD_WAIT = int(os.getenv('PAGE_LOAD_WAIT') or 3)
 class DevelopmentConfig(BaseConfig):
    """Development settings: the base configuration with DEBUG switched on."""

    DEBUG = True
 class ProductionConfig(BaseConfig):
    """Production settings: inherits every value from the base configuration unchanged."""
 # Maps FLASK_ENV values to configuration classes; 'default' is the fallback entry.
 config_by_name = {
    'development': DevelopmentConfig,
    'production': ProductionConfig,
    'default': DevelopmentConfig,
 }
 # Active configuration, chosen once at import time from FLASK_ENV.
 # An unrecognised or empty value (e.g. FLASK_ENV=testing) resolves to the
 # 'default' entry instead of raising KeyError while the package is imported.
 Config = config_by_name.get(
    os.environ.get('FLASK_ENV', 'development'),
    config_by_name['default'],
 )
--- a/app/routes/__init__.py
+++ b/app/routes/__init__.py
--- a/app/routes/health_routes.py
+++ b/app/routes/health_routes.py
@@ -0,0 +1,12 @@
 from flask import Blueprint, jsonify
 import logging
 # Module-level logger, named after this module.
 logger = logging.getLogger(__name__)
 # Blueprint for the health endpoint; every route registered on it is served under /api.
 health_bp = Blueprint('health', __name__, url_prefix='/api')
@health_bp.route('/health', methods=['GET'])
def health_check():
    """Liveness probe: log the call and answer with a fixed JSON status payload."""
    logger.info("Health check requested")
    payload = {"status": "ok"}
    return jsonify(payload)
--- a/app/routes/scrape_routes.py
+++ b/app/routes/scrape_routes.py
@@ -0,0 +1,40 @@
 from flask import Blueprint, request, jsonify
 from app.scraper.html_scraper import HtmlScraper
 import logging
 # Module-level logger, named after this module.
 logger = logging.getLogger(__name__)
 # Blueprint for the scraping endpoint; every route registered on it is served under /api.
 scrape_bp = Blueprint('scrape', __name__, url_prefix='/api')
@scrape_bp.route('/scrape', methods=['POST'])
def scrape_html():
    """
    Return the HTML content of a given page.

    Expected JSON payload:
    {
        "url": "https://example.com"
    }

    Responses:
        200: {"success": true, "html": "<page source>"}
        400: {"success": false, "error": "..."} when the body is not a JSON
             object, has no "url" key, or the url is not an absolute
             http/https URL.
        500: {"success": false, "error": "..."} when the browser fails to
             start or to load the page.
    """
    # Local import so this handler does not depend on a file-level import change.
    from urllib.parse import urlparse

    # silent=True returns None for a wrong Content-Type or malformed JSON, so
    # those client mistakes are answered with 400 below rather than being
    # caught by the generic handler and reported as a server error.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or 'url' not in data:
        logger.warning("Missing url in request")
        return jsonify({"success": False, "error": "Missing url in request"}), 400

    target_url = data['url']

    # The URL is untrusted input handed to a real browser: only absolute
    # http(s) URLs are accepted so schemes such as file:// cannot be used to
    # read content local to the server.
    try:
        parsed = urlparse(target_url) if isinstance(target_url, str) else None
    except ValueError:
        parsed = None
    if parsed is None or parsed.scheme not in ('http', 'https') or not parsed.netloc:
        logger.warning("Rejected invalid url in request")
        return jsonify({
            "success": False,
            "error": "Invalid url: only absolute http and https URLs are supported",
        }), 400

    try:
        logger.info("Scraping HTML for URL: %s", target_url)
        scraper = HtmlScraper()
        try:
            html = scraper.get_page_html(target_url)
            return jsonify({"success": True, "html": html})
        finally:
            # Always release the browser, whether or not the scrape succeeded.
            scraper.close()
    except Exception as e:
        # Top-level boundary for this request: log with traceback, report as 500.
        logger.exception("Error scraping page: %s", e)
        return jsonify({"success": False, "error": str(e)}), 500
--- a/app/scraper/__init__.py
+++ b/app/scraper/__init__.py
--- a/app/scraper/html_scraper.py
+++ b/app/scraper/html_scraper.py
@@ -0,0 +1,84 @@
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.common.exceptions import WebDriverException
 from seleniumbase import Driver
 from app.config.settings import Config
 import logging
 import os
 import tempfile
 import shutil
 # Import-time side effect: requests INFO-level root logging with a
 # "timestamp - logger name - level - message" line format.
 # NOTE(review): per the logging docs, basicConfig does nothing if the root
 # logger already has handlers -- confirm nothing configures logging earlier.
 logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 )
 class HtmlScraper:
    """Scraper that returns the full HTML of a page using SeleniumBase.

    A browser is started by the constructor. Callers are expected to call
    ``close()`` when finished; calling it more than once is safe.
    """

    # Temp-dir cleanup is attempted when free space is below this many bytes (200 MiB).
    _MIN_FREE_TEMP_BYTES = 1024 * 1024 * 200
    # Name prefixes of the temp-dir entries removed by the cleanup.
    _TEMP_PREFIXES = ('tmp', 'selenium')

    def __init__(self, headless=None):
        """Read settings from ``Config`` and start the browser.

        Args:
            headless: Run the browser headless or not. ``None`` (the default)
                uses ``Config.HEADLESS_BROWSER``.

        Raises:
            Exception: If the browser cannot be initialized.
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.driver = None
        self.wait = None
        self.headless = headless if headless is not None else Config.HEADLESS_BROWSER
        self.wait_timeout = Config.DEFAULT_WAIT_TIMEOUT
        self._setup_browser()

    def _setup_browser(self):
        """Create the driver and its wait helper.

        Raises:
            Exception: Wrapping the ``WebDriverException`` that caused the
                failure (available as ``__cause__``).
        """
        try:
            # The directory is created but not referenced again in this class.
            cache_dir = os.path.join(os.path.expanduser("~"), ".browser_cache")
            os.makedirs(cache_dir, exist_ok=True)

            self._free_temp_space()

            self.driver = Driver(uc=True, headless=self.headless)
            self.wait = WebDriverWait(self.driver, self.wait_timeout)
            self.driver.set_script_timeout(30)
            self.logger.info("Browser initialized successfully")
        except WebDriverException as e:
            self.logger.error(f"Failed to initialize browser: {str(e)}")
            # If the driver was already created before the failure, shut it
            # down now: the caller never receives an object it could close.
            self.close()
            raise Exception(f"Browser initialization failed: {str(e)}") from e

    def _free_temp_space(self):
        """Best-effort removal of temp entries when the temp dir is low on space."""
        temp_dir = tempfile.gettempdir()
        if shutil.disk_usage(temp_dir).free >= self._MIN_FREE_TEMP_BYTES:
            return

        self.logger.info("Low disk space detected, cleaning temporary files")
        # NOTE(review): this matches by name prefix only, so it can also remove
        # entries that belong to other processes sharing the same temp dir.
        for item in os.listdir(temp_dir):
            if not item.startswith(self._TEMP_PREFIXES):
                continue
            try:
                path = os.path.join(temp_dir, item)
                if os.path.isdir(path):
                    shutil.rmtree(path, ignore_errors=True)
                else:
                    os.remove(path)
            except Exception as e:
                self.logger.warning(f"Failed to clean temp file {item}: {e}")

    def get_page_html(self, url):
        """
        Navigate to a URL and return the full page HTML.

        Args:
            url: Target URL to scrape.

        Returns:
            The page HTML as a string.

        Raises:
            RuntimeError: If the browser has already been closed.
        """
        import time

        if self.driver is None:
            raise RuntimeError("Browser is not running")

        self.logger.info(f"Navigating to: {url}")
        self.driver.get(url)
        # Fixed pause after navigation before the page source is read.
        time.sleep(Config.PAGE_LOAD_WAIT)
        html = self.driver.page_source
        self.logger.info(f"Retrieved HTML ({len(html)} chars) from {url}")
        return html

    def close(self):
        """Quit the browser if one is running; later calls are no-ops."""
        # getattr keeps this safe on an instance whose __init__ never ran.
        driver = getattr(self, 'driver', None)
        if driver is None:
            return
        # Drop the references first so that a second call (for example from
        # __del__ after an explicit close) never calls quit() again, even if
        # quit() raises.
        self.driver = None
        self.wait = None
        try:
            driver.quit()
            self.logger.info("Browser closed")
        except Exception as e:
            self.logger.error(f"Error closing browser: {str(e)}")

    def __del__(self):
        # Last-resort cleanup; a finalizer must never propagate an exception.
        try:
            self.close()
        except Exception:
            pass
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -0,0 +1,18 @@
 services:
  html-scraper:
    platform: linux/amd64
    build:
      context: .
      dockerfile: Dockerfile
    ports:
      # HOST:CONTAINER. The container side is fixed at 4001 because the image's
      # gunicorn command binds 0.0.0.0:4001; PORT only selects the host port.
      # A single value here would publish 4001 on a random host port.
      - "${PORT:-4001}:4001"
    environment:
      - FLASK_ENV=${FLASK_ENV:-production}
      - FLASK_DEBUG=${FLASK_DEBUG:-0}
      - HEADLESS_BROWSER=${HEADLESS_BROWSER:-True}
      - DEFAULT_WAIT_TIMEOUT=${DEFAULT_WAIT_TIMEOUT:-15}
      - PAGE_LOAD_WAIT=${PAGE_LOAD_WAIT:-3}
      - PORT=${PORT:-4001}
      - SECRET_KEY=${SECRET_KEY:-change-me-in-production}
    restart: unless-stopped
    shm_size: "2gb"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -0,0 +1,18 @@
 [project]
 name = "html-scraper"
 version = "0.1.0"
 description = "A simple API to scrape and return HTML content from any page using SeleniumBase"
 requires-python = ">=3.11"
 dependencies = [
    "flask>=2.3.3,<4",
    "flask-cors>=4.0.0,<5",
    "gunicorn>=21.2.0,<23",
    "python-dotenv>=1.0.0,<2",
    "selenium>=4.15.2,<5",
    "seleniumbase>=4.39.2,<5",
 ]
 [dependency-groups]
 dev = [
    "pytest>=7.0",
 ]
--- a/run.py
+++ b/run.py
@@ -0,0 +1,12 @@
 from dotenv import load_dotenv
 import os
 # Load .env BEFORE importing the app package: app.config.settings reads
 # os.environ while it is being imported, so the order of these lines matters.
 load_dotenv()
 from app import create_app
 # Module-level WSGI application; the Dockerfile's gunicorn command targets "run:app".
 app = create_app()
 if __name__ == '__main__':
    # Direct execution (README: `uv run python run.py`): serve on all
    # interfaces, on PORT from the environment or 4001 when it is unset.
    port = int(os.environ.get('PORT', 4001))
    app.run(host='0.0.0.0', port=port)
--- a/uv.lock
+++ b/uv.lock