feat: initialize HTML scraper API with Flask and SeleniumBase

This commit is contained in:
2026-02-13 16:03:35 +01:00
commit 9659382d62
16 changed files with 1707 additions and 0 deletions

11
.env.example Normal file
View File

@@ -0,0 +1,11 @@
FLASK_ENV=development
FLASK_DEBUG=1
# Selenium
HEADLESS_BROWSER=True
DEFAULT_WAIT_TIMEOUT=15
PAGE_LOAD_WAIT=3
# Server
PORT=4001
SECRET_KEY=change-me-in-production

33
.gitignore vendored Normal file
View File

@@ -0,0 +1,33 @@
venv/
env/
.venv/
ENV/
.env
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
dist/
build/
*.egg-info/
*.log
logs/
.coverage
htmlcov/
.pytest_cache/
.idea/
.vscode/
*.swp
*.swo
.DS_Store
downloaded_files/
# uv
.python-version

48
Dockerfile Normal file
View File

@@ -0,0 +1,48 @@
# Runtime image for the HTML scraper API: Python 3.12 plus the shared
# libraries a Chrome-based browser needs, with dependencies managed by uv.
FROM python:3.12-slim
# Copy the uv and uvx binaries out of the published uv image.
# NOTE(review): the `latest` tag is unpinned, so rebuilds may pick up a
# different uv version — consider pinning a specific tag.
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
# Install system dependencies for Chrome / SeleniumBase
# (the apt package lists are removed afterwards to keep the layer small).
# NOTE(review): no browser package is installed in this file — confirm that
# Chrome is provided some other way at runtime.
RUN apt-get update && apt-get install -y --no-install-recommends \
wget \
gnupg2 \
unzip \
curl \
fonts-liberation \
libasound2 \
libatk-bridge2.0-0 \
libatk1.0-0 \
libcups2 \
libdbus-1-3 \
libdrm2 \
libgbm1 \
libgtk-3-0 \
libnspr4 \
libnss3 \
libxcomposite1 \
libxdamage1 \
libxrandr2 \
xdg-utils \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Presumably makes uv skip the `dev` dependency group — confirm against the
# uv environment-variable reference.
ENV UV_NO_DEV=1
# Install dependencies (cached layer)
# Only uv.lock and pyproject.toml are bind-mounted here, so this layer does
# not depend on the application sources copied below; --no-install-project
# leaves the project itself for the second `uv sync`.
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=uv.lock,target=uv.lock \
--mount=type=bind,source=pyproject.toml,target=pyproject.toml \
uv sync --locked --no-install-project
# Copy the application sources, then sync again to install the project.
COPY . /app
RUN --mount=type=cache,target=/root/.cache/uv \
uv sync --locked
ENV HEADLESS_BROWSER=True
ENV PORT=4001
EXPOSE 4001
# Serve the `app` object from run.py with gunicorn (2 workers, 120 s timeout).
# NOTE(review): the bind address is hard-coded to 0.0.0.0:4001, so changing
# the PORT variable does not change the port gunicorn listens on.
CMD ["uv", "run", "gunicorn", "--bind", "0.0.0.0:4001", "--timeout", "120", "--workers", "2", "run:app"]

68
README.md Normal file
View File

@@ -0,0 +1,68 @@
# HTML Scraper
A simple Python API, built with Flask and SeleniumBase, that returns the rendered HTML of any page through a single scrape endpoint (plus a health-check endpoint).
## Stack
- **Python 3.12** with **uv** for dependency management
- **Flask** as web framework
- **SeleniumBase** (undetected Chrome) for page rendering
- **Gunicorn** as production WSGI server
- **Docker** for containerization
## Setup
### Local development
```bash
# Install dependencies
uv sync
# Copy and edit environment variables
cp .env.example .env
# Run the server
uv run python run.py
```
### Docker
```bash
# Build
docker build -t html-scraper .
# Run
docker run -p 4001:4001 --env-file .env html-scraper
```
## API
### Health check
```
GET /api/health
```
Response:
```json
{"status": "ok"}
```
### Scrape HTML
```
POST /api/scrape
Content-Type: application/json
{
"url": "https://example.com"
}
```
Response:
```json
{
"success": true,
"html": "<!DOCTYPE html>..."
}
```

15
app/__init__.py Normal file
View File

@@ -0,0 +1,15 @@
from flask import Flask
from app.config.settings import Config
def create_app(config_class=Config):
    """Build the Flask application and attach every API blueprint.

    Args:
        config_class: Object whose upper-case attributes are loaded into
            ``app.config``; defaults to the environment-selected ``Config``.

    Returns:
        The configured Flask application.
    """
    flask_app = Flask(__name__)
    flask_app.config.from_object(config_class)

    # Route modules are imported here rather than at module top level, so
    # they are only loaded once an application is actually being built.
    from app.routes.health_routes import health_bp
    from app.routes.scrape_routes import scrape_bp

    for blueprint in (health_bp, scrape_bp):
        flask_app.register_blueprint(blueprint)

    return flask_app

0
app/config/__init__.py Normal file
View File

28
app/config/settings.py Normal file
View File

@@ -0,0 +1,28 @@
import os
class BaseConfig:
    """Settings shared by every environment, read from the process environment."""

    # Falls back to a fixed development key when SECRET_KEY is unset or empty.
    SECRET_KEY = os.environ.get('SECRET_KEY') or 'dev-key-for-development-only'
    DEBUG = False
    TESTING = False

    # Only the string "true" (any casing) enables headless mode.
    HEADLESS_BROWSER = os.environ.get('HEADLESS_BROWSER', 'True').lower() == 'true'
    # `or` makes an empty variable fall back to the default as well.
    DEFAULT_WAIT_TIMEOUT = int(os.environ.get('DEFAULT_WAIT_TIMEOUT') or 15)
    PAGE_LOAD_WAIT = int(os.environ.get('PAGE_LOAD_WAIT') or 3)


class DevelopmentConfig(BaseConfig):
    """Local development settings: same as the base, with DEBUG enabled."""

    DEBUG = True


class ProductionConfig(BaseConfig):
    """Production settings: currently identical to the base settings."""


config_by_name = {
    'development': DevelopmentConfig,
    'production': ProductionConfig,
    'default': DevelopmentConfig,
}


def get_config(env_name=None):
    """Return the config class for an environment name.

    Args:
        env_name: Environment name such as ``'development'`` or
            ``'production'``. When ``None``, the ``FLASK_ENV`` environment
            variable is used (``'development'`` if it is unset).

    Returns:
        The matching class from ``config_by_name``. Names are matched
        case-insensitively and ignoring surrounding whitespace; an unknown or
        empty name yields the ``'default'`` entry instead of raising
        ``KeyError`` at import time.
    """
    if env_name is None:
        env_name = os.environ.get('FLASK_ENV', 'development')
    return config_by_name.get(env_name.strip().lower(), config_by_name['default'])


# Resolved once at import time; the environment must be populated before this
# module is first imported.
Config = get_config()

0
app/routes/__init__.py Normal file
View File

View File

@@ -0,0 +1,12 @@
from flask import Blueprint, jsonify
import logging
logger = logging.getLogger(__name__)

# Every route on this blueprint is served under the /api prefix.
health_bp = Blueprint('health', __name__, url_prefix='/api')


@health_bp.route('/health', methods=['GET'])
def health_check():
    """Liveness probe: log the request and answer with a fixed OK payload."""
    logger.info("Health check requested")
    status_payload = {"status": "ok"}
    return jsonify(status_payload)

View File

@@ -0,0 +1,40 @@
from flask import Blueprint, request, jsonify
from app.scraper.html_scraper import HtmlScraper
import logging
logger = logging.getLogger(__name__)

# Every route on this blueprint is served under the /api prefix.
scrape_bp = Blueprint('scrape', __name__, url_prefix='/api')


def _error_response(message, status):
    """Build the JSON error body used by this blueprint with its HTTP status."""
    return jsonify({"success": False, "error": message}), status


def _is_http_url(value):
    """Return True when *value* is an absolute http:// or https:// URL string."""
    # Imported locally so this block does not depend on the module's import list.
    from urllib.parse import urlparse

    if not isinstance(value, str):
        return False
    try:
        parts = urlparse(value)
    except ValueError:
        # urlparse raises on some malformed inputs (e.g. a broken IPv6 host).
        return False
    return parts.scheme in ('http', 'https') and bool(parts.netloc)


@scrape_bp.route('/scrape', methods=['POST'])
def scrape_html():
    """
    Return the HTML content of a given page.

    Expected JSON payload:
    {
        "url": "https://example.com"
    }

    Responses:
        200: {"success": true, "html": "..."} on success.
        400: {"success": false, "error": "..."} when the body is not a JSON
            object, has no "url" key, or the url is not an absolute http(s) URL.
        500: {"success": false, "error": "..."} when the scrape itself fails.
    """
    # silent=True yields None for a wrong content type or malformed JSON, so
    # those client errors get a 400 below instead of being reported as a 500.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or 'url' not in data:
        logger.warning("Missing url in request")
        return _error_response("Missing url in request", 400)

    target_url = data['url']
    if isinstance(target_url, str):
        target_url = target_url.strip()

    # The URL is handed to a real browser, so schemes such as file:// would
    # expose local files; only http(s) targets are accepted.
    # NOTE(review): http(s) URLs pointing at internal hosts are still allowed.
    if not _is_http_url(target_url):
        logger.warning("Rejected non-HTTP(S) url: %r", target_url)
        return _error_response("url must be an absolute http(s) URL", 400)

    logger.info(f"Scraping HTML for URL: {target_url}")
    scraper = None
    try:
        scraper = HtmlScraper()
        html = scraper.get_page_html(target_url)
        return jsonify({"success": True, "html": html})
    except Exception as e:
        # Top-level boundary of the request: log with traceback, report as 500.
        logger.exception("Error scraping page: %s", e)
        return _error_response(str(e), 500)
    finally:
        # Always release the browser, including when navigation failed.
        if scraper is not None:
            scraper.close()

0
app/scraper/__init__.py Normal file
View File

View File

@@ -0,0 +1,84 @@
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import WebDriverException
from seleniumbase import Driver
from app.config.settings import Config
import logging
import os
import tempfile
import shutil
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)


class HtmlScraper:
    """Scraper that returns the full HTML of a page using SeleniumBase.

    A browser is started in the constructor; call ``close()`` when done to
    shut it down. ``close()`` may be called more than once.
    """

    # Temp entries are purged when the temp directory has less free space
    # than this many bytes (200 MiB).
    MIN_FREE_TEMP_BYTES = 1024 * 1024 * 200
    # Only temp entries whose names start with one of these are purged.
    TEMP_PREFIXES = ('tmp', 'selenium')

    def __init__(self, headless=None):
        """Start a browser.

        Args:
            headless: Run without a visible window. ``None`` means use
                ``Config.HEADLESS_BROWSER``.

        Raises:
            Exception: If the browser cannot be initialized.
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.driver = None
        self.wait = None
        self.headless = headless if headless is not None else Config.HEADLESS_BROWSER
        self.wait_timeout = Config.DEFAULT_WAIT_TIMEOUT
        self._setup_browser()

    def _free_temp_space(self):
        """Delete tmp*/selenium* entries from the temp dir when disk space is low."""
        temp_dir = tempfile.gettempdir()
        if shutil.disk_usage(temp_dir).free >= self.MIN_FREE_TEMP_BYTES:
            return

        self.logger.info("Low disk space detected, cleaning temporary files")
        # NOTE(review): the prefix match is broad and may remove temp entries
        # that belong to other processes sharing this temp directory.
        for item in os.listdir(temp_dir):
            if not item.startswith(self.TEMP_PREFIXES):
                continue
            path = os.path.join(temp_dir, item)
            try:
                if os.path.isdir(path):
                    shutil.rmtree(path, ignore_errors=True)
                else:
                    os.remove(path)
            except Exception as e:
                # Best effort: a file that cannot be removed is only reported.
                self.logger.warning(f"Failed to clean temp file {item}: {e}")

    def _setup_browser(self):
        """Create the driver and its explicit-wait helper.

        Raises:
            Exception: Wrapping the underlying ``WebDriverException`` when the
                browser fails to start or configure.
        """
        try:
            # NOTE(review): this directory is created but nothing in this
            # class reads or writes it afterwards.
            cache_dir = os.path.join(os.path.expanduser("~"), ".browser_cache")
            os.makedirs(cache_dir, exist_ok=True)

            self._free_temp_space()

            self.driver = Driver(uc=True, headless=self.headless)
            self.wait = WebDriverWait(self.driver, self.wait_timeout)
            self.driver.set_script_timeout(30)
            self.logger.info("Browser initialized successfully")
        except WebDriverException as e:
            self.logger.error(f"Failed to initialize browser: {str(e)}")
            # The driver may already exist if a later step failed; shut it
            # down so no browser process is left behind.
            self.close()
            raise Exception(f"Browser initialization failed: {str(e)}") from e

    def get_page_html(self, url):
        """
        Navigate to a URL and return the full page HTML.

        Args:
            url: Target URL to scrape.

        Returns:
            The page HTML as a string.

        Raises:
            RuntimeError: If the browser has already been closed.
        """
        if self.driver is None:
            raise RuntimeError("Browser is not initialized")

        import time

        self.logger.info(f"Navigating to: {url}")
        self.driver.get(url)
        # Fixed pause after navigation before the page source is read.
        time.sleep(Config.PAGE_LOAD_WAIT)
        html = self.driver.page_source
        self.logger.info(f"Retrieved HTML ({len(html)} chars) from {url}")
        return html

    def close(self):
        """Quit the browser if one is open; safe to call repeatedly."""
        driver = getattr(self, 'driver', None)
        if driver is None:
            return
        # Drop the references before quitting so that a later close() — for
        # example the one issued by __del__ — does not quit the driver twice.
        self.driver = None
        self.wait = None
        try:
            driver.quit()
            self.logger.info("Browser closed")
        except Exception as e:
            self.logger.error(f"Error closing browser: {str(e)}")

    def __del__(self):
        # Safety net for callers that forgot close(); a no-op once closed.
        self.close()

18
docker-compose.yml Normal file
View File

@@ -0,0 +1,18 @@
services:
  html-scraper:
    platform: linux/amd64
    build:
      context: .
      dockerfile: Dockerfile
    ports:
      # host:container — publish on the host port chosen by PORT. The
      # container side stays 4001 because the image's gunicorn command binds
      # 0.0.0.0:4001 regardless of PORT. (A lone container port here would
      # be published on a random host port.)
      - "${PORT:-4001}:4001"
    environment:
      - FLASK_ENV=${FLASK_ENV:-production}
      - FLASK_DEBUG=${FLASK_DEBUG:-0}
      - HEADLESS_BROWSER=${HEADLESS_BROWSER:-True}
      - DEFAULT_WAIT_TIMEOUT=${DEFAULT_WAIT_TIMEOUT:-15}
      - PAGE_LOAD_WAIT=${PAGE_LOAD_WAIT:-3}
      - PORT=${PORT:-4001}
      - SECRET_KEY=${SECRET_KEY:-change-me-in-production}
    restart: unless-stopped
    # Larger /dev/shm for the browser running inside the container.
    shm_size: "2gb"

18
pyproject.toml Normal file
View File

@@ -0,0 +1,18 @@
[project]
name = "html-scraper"
version = "0.1.0"
description = "A simple API to scrape and return HTML content from any page using SeleniumBase"
requires-python = ">=3.11"
dependencies = [
"flask>=2.3.3,<4",
"flask-cors>=4.0.0,<5",
"gunicorn>=21.2.0,<23",
"python-dotenv>=1.0.0,<2",
"selenium>=4.15.2,<5",
"seleniumbase>=4.39.2,<5",
]
[dependency-groups]
dev = [
"pytest>=7.0",
]

12
run.py Normal file
View File

@@ -0,0 +1,12 @@
from dotenv import load_dotenv
import os
load_dotenv()
from app import create_app
# Module-level application object; the Docker image serves it via `run:app`.
app = create_app()

if __name__ == '__main__':
    # Direct execution: serve on all interfaces, on PORT (4001 when unset).
    listen_port = int(os.getenv('PORT', 4001))
    app.run(host='0.0.0.0', port=listen_port)

1320
uv.lock generated Normal file

File diff suppressed because it is too large Load Diff