feat: initialize HTML scraper API with Flask and SeleniumBase

2026-02-13 16:03:35 +01:00
commit 9659382d62
16 changed files with 1707 additions and 0 deletions
--- a/app/routes/__init__.py
+++ b/app/routes/__init__.py
--- a/app/routes/health_routes.py
+++ b/app/routes/health_routes.py
@@ -0,0 +1,12 @@
+from flask import Blueprint, jsonify
+import logging
+
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+
+# Blueprint holding the health endpoint; its routes are served under /api.
+health_bp = Blueprint('health', __name__, url_prefix='/api')
+
+
+@health_bp.route('/health', methods=['GET'])
+def health_check():
+    logger.info("Health check requested")
+    return jsonify({"status": "ok"})
--- a/app/routes/scrape_routes.py
+++ b/app/routes/scrape_routes.py
@@ -0,0 +1,40 @@
+from flask import Blueprint, request, jsonify
+from app.scraper.html_scraper import HtmlScraper
+import logging
+
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+
+# Blueprint holding the scrape endpoint; its routes are served under /api.
+scrape_bp = Blueprint('scrape', __name__, url_prefix='/api')
+
+
+@scrape_bp.route('/scrape', methods=['POST'])
+def scrape_html():
+    """
+    Return the HTML content of a given page.
+
+    Expected JSON payload:
+    {
+        "url": "https://example.com"
+    }
+    """
+    try:
+        data = request.get_json()
+
+        if not data or 'url' not in data:
+            logger.warning("Missing url in request")
+            return jsonify({"success": False, "error": "Missing url in request"}), 400
+
+        target_url = data['url']
+        logger.info(f"Scraping HTML for URL: {target_url}")
+
+        scraper = HtmlScraper()
+
+        try:
+            html = scraper.get_page_html(target_url)
+            return jsonify({"success": True, "html": html})
+        finally:
+            scraper.close()
+
+    except Exception as e:
+        logger.error(f"Error scraping page: {str(e)}")
+        return jsonify({"success": False, "error": str(e)}), 500