feat: initialize HTML scraper API with Flask and SeleniumBase
This commit is contained in:
15
app/__init__.py
Normal file
15
app/__init__.py
Normal file
@@ -0,0 +1,15 @@
|
||||
from flask import Flask
|
||||
from app.config.settings import Config
|
||||
|
||||
|
||||
def create_app(config_class=Config):
    """Application factory: build and configure the Flask app.

    Args:
        config_class: Configuration class loaded into ``app.config``;
            defaults to the environment-selected ``Config``.

    Returns:
        The configured Flask application with all blueprints registered.
    """
    application = Flask(__name__)
    application.config.from_object(config_class)

    # Imported inside the factory (not at module top) to avoid circular
    # imports between the app package and the route modules.
    from app.routes.health_routes import health_bp
    from app.routes.scrape_routes import scrape_bp

    for blueprint in (health_bp, scrape_bp):
        application.register_blueprint(blueprint)

    return application
|
||||
0
app/config/__init__.py
Normal file
0
app/config/__init__.py
Normal file
28
app/config/settings.py
Normal file
28
app/config/settings.py
Normal file
@@ -0,0 +1,28 @@
|
||||
import os
|
||||
|
||||
|
||||
class BaseConfig:
    """Shared configuration defaults; each value may be overridden via environment."""

    # NOTE(review): the fallback key is fine for development, but production
    # deployments should always provide SECRET_KEY via the environment.
    SECRET_KEY = os.environ.get('SECRET_KEY') or 'dev-key-for-development-only'
    DEBUG = False
    TESTING = False

    # Scraper tuning knobs (seconds for the two timeouts).
    HEADLESS_BROWSER = os.environ.get('HEADLESS_BROWSER', 'True').lower() == 'true'
    # `or` (rather than a get() default) also covers empty-string env values.
    DEFAULT_WAIT_TIMEOUT = int(os.environ.get('DEFAULT_WAIT_TIMEOUT') or 15)
    PAGE_LOAD_WAIT = int(os.environ.get('PAGE_LOAD_WAIT') or 3)


class DevelopmentConfig(BaseConfig):
    """Development settings: debug mode enabled."""

    DEBUG = True


class ProductionConfig(BaseConfig):
    """Production settings: inherits the base defaults unchanged."""

    pass


# Registry mapping FLASK_ENV values to configuration classes.
config_by_name = {
    'development': DevelopmentConfig,
    'production': ProductionConfig,
    'default': DevelopmentConfig,
}

# Fall back to the 'default' entry for unrecognized FLASK_ENV values instead
# of raising KeyError at import time (the original indexed the dict directly,
# so any FLASK_ENV other than the two known names crashed the import).
Config = config_by_name.get(
    os.environ.get('FLASK_ENV', 'development'),
    config_by_name['default'],
)
|
||||
0
app/routes/__init__.py
Normal file
0
app/routes/__init__.py
Normal file
12
app/routes/health_routes.py
Normal file
12
app/routes/health_routes.py
Normal file
@@ -0,0 +1,12 @@
|
||||
from flask import Blueprint, jsonify
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)

health_bp = Blueprint('health', __name__, url_prefix='/api')


@health_bp.route('/health', methods=['GET'])
def health_check():
    """Report service liveness with a constant JSON payload."""
    logger.info("Health check requested")
    payload = {"status": "ok"}
    return jsonify(payload)
|
||||
40
app/routes/scrape_routes.py
Normal file
40
app/routes/scrape_routes.py
Normal file
@@ -0,0 +1,40 @@
|
||||
from flask import Blueprint, request, jsonify
|
||||
from app.scraper.html_scraper import HtmlScraper
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)

scrape_bp = Blueprint('scrape', __name__, url_prefix='/api')


@scrape_bp.route('/scrape', methods=['POST'])
def scrape_html():
    """
    Return the HTML content of a given page.

    Expected JSON payload:
        {
            "url": "https://example.com"
        }

    Returns:
        200 with {"success": True, "html": ...} on success,
        400 for a missing/invalid payload, 500 on scraper failure.
    """
    try:
        # silent=True yields None for a missing or malformed JSON body, so bad
        # payloads get a 400 here instead of raising and bubbling up as a 500.
        data = request.get_json(silent=True)

        if not data or 'url' not in data:
            logger.warning("Missing url in request")
            return jsonify({"success": False, "error": "Missing url in request"}), 400

        target_url = data['url']
        # Reject non-string / blank URLs before paying the cost of a browser.
        if not isinstance(target_url, str) or not target_url.strip():
            logger.warning("Invalid url in request")
            return jsonify({"success": False, "error": "Invalid url in request"}), 400

        logger.info(f"Scraping HTML for URL: {target_url}")

        # NOTE(review): a full browser is launched per request; consider a
        # pooled/shared scraper if throughput becomes a concern.
        scraper = HtmlScraper()

        try:
            html = scraper.get_page_html(target_url)
            return jsonify({"success": True, "html": html})
        finally:
            # Always release the browser, even when scraping fails.
            scraper.close()

    except Exception as e:
        # Boundary handler: log with traceback and return a JSON error.
        logger.exception(f"Error scraping page: {str(e)}")
        return jsonify({"success": False, "error": str(e)}), 500
|
||||
0
app/scraper/__init__.py
Normal file
0
app/scraper/__init__.py
Normal file
84
app/scraper/html_scraper.py
Normal file
84
app/scraper/html_scraper.py
Normal file
@@ -0,0 +1,84 @@
|
||||
import logging
import os
import shutil
import tempfile
import time

from selenium.common.exceptions import WebDriverException
from selenium.webdriver.support.ui import WebDriverWait
from seleniumbase import Driver

from app.config.settings import Config
|
||||
|
||||
# NOTE(review): calling basicConfig at import time configures the root logger
# as a module-level side effect; library modules usually leave this to the
# application entry point — confirm this placement is intended.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
|
||||
|
||||
|
||||
class HtmlScraper:
    """Scraper that returns the full HTML of a page using SeleniumBase."""

    def __init__(self, headless=None):
        """
        Start a browser immediately.

        Args:
            headless: Run the browser headless; defaults to
                Config.HEADLESS_BROWSER when None.

        Raises:
            Exception: if the underlying WebDriver fails to start.
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.driver = None
        self.wait = None
        self.headless = headless if headless is not None else Config.HEADLESS_BROWSER
        self.wait_timeout = Config.DEFAULT_WAIT_TIMEOUT
        self._setup_browser()

    def _setup_browser(self):
        """Launch the browser, freeing temp-dir space first if it is low."""
        try:
            # NOTE(review): this directory is created but never referenced
            # again in this class — presumably intended as a browser profile
            # cache; confirm it is still needed.
            cache_dir = os.path.join(os.path.expanduser("~"), ".browser_cache")
            os.makedirs(cache_dir, exist_ok=True)

            self._clean_temp_if_low_disk()

            # uc=True enables SeleniumBase's undetected-chromedriver mode.
            self.driver = Driver(uc=True, headless=self.headless)
            self.wait = WebDriverWait(self.driver, self.wait_timeout)
            self.driver.set_script_timeout(30)
            self.logger.info("Browser initialized successfully")

        except WebDriverException as e:
            self.logger.error(f"Failed to initialize browser: {str(e)}")
            raise Exception(f"Browser initialization failed: {str(e)}")

    def _clean_temp_if_low_disk(self):
        """Best-effort removal of tmp/selenium leftovers when free space < 200 MB."""
        temp_dir = tempfile.gettempdir()
        if shutil.disk_usage(temp_dir).free >= 1024 * 1024 * 200:
            return

        self.logger.info("Low disk space detected, cleaning temporary files")
        for item in os.listdir(temp_dir):
            # Only touch entries that look like selenium/tempfile leftovers;
            # NOTE(review): live temp files of other processes matching these
            # prefixes could still be removed — this is deliberately best-effort.
            if item.startswith('tmp') or item.startswith('selenium'):
                try:
                    path = os.path.join(temp_dir, item)
                    if os.path.isdir(path):
                        shutil.rmtree(path, ignore_errors=True)
                    else:
                        os.remove(path)
                except Exception as e:
                    self.logger.warning(f"Failed to clean temp file {item}: {e}")

    def get_page_html(self, url):
        """
        Navigate to a URL and return the full page HTML.

        Args:
            url: Target URL to scrape.

        Returns:
            The page HTML as a string.
        """
        self.logger.info(f"Navigating to: {url}")
        self.driver.get(url)

        # Fixed settle delay for dynamic content; an explicit WebDriverWait
        # condition would be more robust for specific pages.
        time.sleep(Config.PAGE_LOAD_WAIT)

        html = self.driver.page_source
        self.logger.info(f"Retrieved HTML ({len(html)} chars) from {url}")
        return html

    def close(self):
        """Shut down the browser. Safe to call more than once."""
        if self.driver:
            try:
                self.driver.quit()
                self.logger.info("Browser closed")
            except Exception as e:
                self.logger.error(f"Error closing browser: {str(e)}")
            finally:
                # Clear the handle so repeated close()/__del__ calls are
                # no-ops instead of quitting an already-dead driver.
                self.driver = None
                self.wait = None

    def __del__(self):
        # getattr guards against partially-initialized instances whose
        # __init__ raised before self.driver was assigned.
        if getattr(self, 'driver', None):
            self.close()
|
||||
Reference in New Issue
Block a user