diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..d0e2168
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,12 @@
+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+
+# Virtual environments
+.venv
+
+.git
diff --git a/.github/workflows/build-and-publish.yml b/.github/workflows/build-and-publish.yml
new file mode 100644
index 0000000..7ed9787
--- /dev/null
+++ b/.github/workflows/build-and-publish.yml
@@ -0,0 +1,57 @@
+name: Build and Push Docker Image
+
+on:
+  push:
+    branches: [ "main", "master" ]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Log in to GHCR
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: |
+            ${{ secrets.DOCKERHUB_USERNAME }}/${{ github.event.repository.name }}
+            ghcr.io/${{ github.repository }}
+          tags: |
+            type=ref,event=branch
+            type=semver,pattern={{version}}
+            type=semver,pattern={{major}}.{{minor}}
+            type=raw,value=latest,enable={{is_default_branch}}
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: ./Dockerfile
+          platforms: linux/amd64
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
diff --git a/BUILD.md b/BUILD.md
new file mode 100644
index 0000000..f9a8989
--- /dev/null
+++ b/BUILD.md
@@ -0,0 +1,9 @@
+# Build (Docker)
+
+Also works with Podman.
+
+```sh
+docker build -t 'chromedriver-http-proxy' .
+docker run --rm -p "32323:32323" chromedriver-http-proxy
+```
+
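Once the image from BUILD.md is built and running, a quick smoke test (a sketch: the `/` endpoint and its `url` query parameter come from proxy.py later in this diff, and `https://example.com` stands in for any target page):

```sh
# Request a page through the proxy; the response body is the rendered HTML.
curl "http://localhost:32323/?url=https://example.com"
```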
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..fed03b5
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,30 @@
+FROM ghcr.io/astral-sh/uv:debian-slim
+
+ENV PROXY_PORT=32323
+ENV PROXY_HOST=0.0.0.0
+ENV SCRAPER_WAIT_TIME=10
+ENV SCRAPER_HEADLESS=True
+ENV SCRAPER_USER_AGENT="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"
+
+WORKDIR /app
+COPY uv.lock pyproject.toml /app/
+RUN uv sync --locked
+
+RUN apt-get update && apt-get install -y \
+    wget \
+    gnupg \
+    ca-certificates \
+    && wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | apt-key add - \
+    && echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list \
+    && apt-get update \
+    && apt-get install -y google-chrome-stable \
+    && rm -rf /var/cache/apt/archives /var/lib/apt/lists/*
+
+COPY . /app
+
+CMD uv run proxy.py \
+    --port="$PROXY_PORT" \
+    --host="$PROXY_HOST" \
+    --wait="$SCRAPER_WAIT_TIME" \
+    --headless="$SCRAPER_HEADLESS" \
+    --user-agent="$SCRAPER_USER_AGENT"
diff --git a/README.md b/README.md
index d0d6c88..c9b9a24 100644
--- a/README.md
+++ b/README.md
@@ -6,11 +6,21 @@ Simple HTTP proxy that renders pages with undetected-chromedriver and returns th
 
 - Solves [Anubis](https://anubis.techaro.lol/)
 - Solves [go-away](https://git.gammaspectra.live/git/go-away)
-- Solves similiar POW challenges
+- Solves similar [POW challenges](https://git.gammaspectra.live/git/go-away#other-similar-projects)
 - Sometimes bypasses Cloudflare Turnstile
 
 ## Installation
 
+### Container
+
+```sh
+docker run --rm -p "32323:32323" ghcr.io/s4wyer/chromedriver-http-proxy # or s44wyer/chromedriver-http-proxy
+```
+
+There's also a [Docker compose example](/docker-compose.yml) with a more complete config.
+
+### System
+
 uv:
 
 ```sh
@@ -52,7 +62,7 @@ This proxy has no authentication, and I don't plan to add any (PRs welcome thoug
 
 ## TODO
 
-- [ ] Docker image
+- [ ] ARM Docker images
 - [ ] Send JS/CSS to the client
 - [ ] Custom Chromium binary locations
 - [ ] More CLI arguments to control ChromeDriver behavior
@@ -60,6 +70,7 @@ This proxy has no authentication, and I don't plan to add any (PRs welcome thoug
 - [ ] Screenshot endpoint
 - [ ] Allow custom headers
 - [ ] POST requests
+- [x] Docker image
 
 ## Similiar Projects
 
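The README's `docker run` one-liner uses the image defaults; a sketch of overriding them at run time via the environment variables the Dockerfile above declares (the values here are illustrative):

```sh
docker run --rm -p "32323:32323" \
  -e SCRAPER_WAIT_TIME=5 \
  -e SCRAPER_USER_AGENT="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36" \
  ghcr.io/s4wyer/chromedriver-http-proxy
```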
diff --git a/config.py b/config.py
new file mode 100644
index 0000000..2ad821c
--- /dev/null
+++ b/config.py
@@ -0,0 +1,72 @@
+import os
+import argparse
+from dataclasses import dataclass
+
+
+@dataclass
+class ScraperConfig:
+    wait_time: float
+    headless: bool
+    user_agent: str
+
+
+@dataclass
+class ServerConfig:
+    host: str
+    port: int
+
+
+def get_configs():
+    parser = argparse.ArgumentParser(prog="ChromeDriver HTTP Proxy",
+                                     description="Simple HTTP proxy that renders pages with undetected-chromedriver and returns the HTML",
+                                     usage="")
+    parser.add_argument(
+        "--port",
+        help="Port the proxy runs on.",
+        required=False,
+        type=int,
+        default=os.getenv("PROXY_PORT", 32323)
+    )
+
+    parser.add_argument(
+        "--host",
+        help="Host the proxy runs on.",
+        required=False,
+        type=str,
+        default=os.getenv("PROXY_HOST", "0.0.0.0")
+    )
+
+    parser.add_argument(
+        "--wait",
+        help="Seconds to wait before returning content.",
+        required=False,
+        type=float,
+        default=os.getenv("SCRAPER_WAIT_TIME", 10)
+    )
+
+    parser.add_argument(
+        "--headless",
+        help="Whether or not to run Chrome headless (true/false).",
+        required=False,
+        type=lambda v: str(v).strip().lower() in ("1", "true", "yes"),  # type=bool treats any non-empty string as True
+        default=os.getenv("SCRAPER_HEADLESS", "True")
+    )
+
+    parser.add_argument(
+        "--user-agent",
+        help="Chrome user agent. Keeping it in sync with the installed ChromeDriver version is recommended.",
+        required=False,
+        type=str,
+        default=os.getenv("SCRAPER_USER_AGENT", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36")
+    )
+
+    args = parser.parse_args()
+
+    server_config = ServerConfig(host=args.host,
+                                 port=args.port)
+
+    scraper_config = ScraperConfig(wait_time=args.wait,
+                                   headless=args.headless,
+                                   user_agent=args.user_agent)
+
+    return server_config, scraper_config
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..49bdd48
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,20 @@
+services:
+  chromedriver-http-proxy:
+    image: ghcr.io/s4wyer/chromedriver-http-proxy:latest
+    container_name: chromedriver-http-proxy
+
+    ports:
+      - "32323:32323"
+
+    environment:
+      - PROXY_PORT=32323
+      - PROXY_HOST=0.0.0.0
+      - SCRAPER_WAIT_TIME=10
+      - SCRAPER_HEADLESS=True
+      - SCRAPER_USER_AGENT=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36
+
+    # Increase shared memory size to prevent Chromium from crashing.
+    # Recommended by Selenium: https://hub.docker.com/r/selenium/standalone-chrome
+    shm_size: '2gb'
+
+    restart: unless-stopped
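A sketch of exercising the compose service above (assumes the file is saved as docker-compose.yml in the working directory; the target URL is illustrative):

```sh
docker compose up -d
curl "http://localhost:32323/?url=https://example.com"
docker compose logs -f chromedriver-http-proxy  # follow the proxy's log output
```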
diff --git a/proxy.py b/proxy.py
index c86bc5f..f88a4f4 100644
--- a/proxy.py
+++ b/proxy.py
@@ -1,6 +1,8 @@
 import time
-from dataclasses import dataclass
-import argparse
+import atexit
+import logging
+
+from config import get_configs, ScraperConfig
 
 import undetected_chromedriver as uc
 from selenium.webdriver.support.ui import WebDriverWait
@@ -9,120 +11,57 @@
 from selenium.webdriver import ChromeOptions
 from flask import Flask, request
 
 
-@dataclass
-class ScraperConfig:
-    wait_time: float
-    headless: bool
-    user_agent: str
-
-
-@dataclass
-class ServerConfig:
-    host: str
-    port: int
-
-
 class Scraper:
     def __init__(self, config: ScraperConfig):
         self.config = config
         self.driver = None
-
-    def __enter__(self):
         self._setup_driver()
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self._cleanup()
-
-    def _cleanup(self):
-        driver = self.driver
-        driver.close()
-        driver.quit()
 
     def _setup_driver(self):
-        headless = self.config.headless
-        user_agent = self.config.user_agent
-
         chrome_options = ChromeOptions()
-        chrome_options.add_argument(f"--user-agent={user_agent}")
+        chrome_options.add_argument(f"--user-agent={self.config.user_agent}")
 
         self.driver = uc.Chrome(
-            headless=headless,
+            headless=self.config.headless,
             options=chrome_options,
             use_subprocess=False
        )
 
+        logger.info("Driver started.")
+
+    def cleanup(self):
+        if self.driver:
+            try:
+                self.driver.quit()
+                logger.info("Driver closed.")
+            except Exception as e:
+                logger.error(f"Exception during cleanup: {e}")
+            finally:
+                self.driver = None
+
     def render_page(self, url):
-        wait_time = self.config.wait_time
-        driver = self.driver
+        logger.info(f"Fetching {url}...")
+        self.driver.get(url)
 
-        driver.get(url)
-
-        WebDriverWait(self.driver, wait_time).until(
+        WebDriverWait(self.driver, timeout=self.config.wait_time).until(
             lambda driver: driver.execute_script("return document.readyState") == "complete"
         )
 
-        time.sleep(wait_time)
+        time.sleep(self.config.wait_time)
+
+        logger.info(f"Fetched {url}.")
 
         return self.driver.page_source
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(prog="ChromeDriver HTTP Proxy",
-                                     description="Simple HTTP proxy that renders pages with undetected-chromedriver and returns the HTML",
-                                     usage="")
-    parser.add_argument(
-        "--port",
-        help="Port the proxy runs on.",
-        required=False,
-        type=int,
-        default=32323
-    )
+    logging.basicConfig(level=logging.INFO)
+    logger = logging.getLogger(__name__)
 
-    parser.add_argument(
-        "--host",
-        help="Host the proxy to runs on.",
-        required=False,
-        type=str,
-        default="0.0.0.0"
-    )
+    server_config, scraper_config = get_configs()
 
-    parser.add_argument(
-        "--wait",
-        help="Seconds to wait before returning content.",
-        required=False,
-        type=float,
-        default=10
-    )
+    scraper = Scraper(scraper_config)
 
-    parser.add_argument(
-        "--headless",
-        help="Whether or not to run Chrome headless.",
-        required=False,
-        type=bool,
-        default=True
-    )
-
-    parser.add_argument(
-        "--user-agent",
-        help="Chrome user agent. Changing with the current ChromeDriver version recommended.",
-        required=False,
-        type=str,
-        default="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"
-
-    )
-
-    args = parser.parse_args()
-
-    port = args.port
-    host = args.host
-
-    wait = args.wait
-    headless = args.headless
-    user_agent = args.user_agent
-
-    server_config = ServerConfig(host=host, port=port)
-    scraper_config = ScraperConfig(wait_time=wait, headless=headless, user_agent=user_agent)
+    atexit.register(scraper.cleanup)
 
     # run the server
     app = Flask(__name__)
@@ -130,11 +69,13 @@ if __name__ == "__main__":
     @app.route("/")
     def proxy_route():
         url = request.args.get("url")
-        with Scraper(scraper_config) as scraper:
-            try:
-                html = scraper.render_page(url)
-                return html
-            except Exception as e:
-                print(f"Error: {e}")
+
+        try:
+            html = scraper.render_page(url)
+            logger.info(f"Successfully sent {url} to client.")
+            return html
+        except Exception as e:
+            logger.error(f"Error sending {url} to client: {e}")
+            return f"Error: {e}", 500
 
     app.run(host=server_config.host, port=server_config.port)
diff --git a/pyproject.toml b/pyproject.toml
index 324808e..3e9a68b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,6 +3,8 @@ name = "playwright-http-proxy"
 version = "0.1.0"
 description = "Add your description here"
 readme = "README.md"
+license = "AGPL-3.0-only"
+license-files = ["LICEN[CS]E*"]
 requires-python = ">=3.13"
 dependencies = [
     "argparse>=1.4.0",
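For the remaining "ARM Docker images" TODO, a multi-platform Buildx invocation is the usual starting point (a sketch only: the Dockerfile's Chrome apt source is pinned to `arch=amd64`, so an arm64 image would also need a different Chromium package, e.g. Debian's `chromium`):

```sh
docker buildx build --platform linux/amd64,linux/arm64 -t chromedriver-http-proxy .
```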