diff --git a/.dockerignore b/.dockerignore deleted file mode 100644 index d0e2168..0000000 --- a/.dockerignore +++ /dev/null @@ -1,12 +0,0 @@ -# Python-generated files -__pycache__/ -*.py[oc] -build/ -dist/ -wheels/ -*.egg-info - -# Virtual environments -.venv - -.git diff --git a/.github/workflows/build-and-publish.yml b/.github/workflows/build-and-publish.yml deleted file mode 100644 index 7ed9787..0000000 --- a/.github/workflows/build-and-publish.yml +++ /dev/null @@ -1,57 +0,0 @@ -name: Build and Push Docker Image - -on: - push: - branches: [ "main", "master" ] - -jobs: - build: - runs-on: ubuntu-latest - permissions: - contents: read - packages: write - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Log in to Docker Hub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - - - name: Log in to GHCR - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Extract metadata (tags, labels) for Docker - id: meta - uses: docker/metadata-action@v5 - with: - images: | - ${{ secrets.DOCKERHUB_USERNAME }}/${{ github.event.repository.name }} - ghcr.io/${{ github.repository }} - tags: | - type=ref,event=branch - type=semver,pattern={{version}} - type=semver,pattern={{major}}.{{minor}} - type=raw,value=latest,enable={{is_default_branch}} - - - name: Build and push Docker image - uses: docker/build-push-action@v5 - with: - context: . - file: ./Dockerfile - platforms: linux/amd64 - push: true - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - cache-from: type=gha - cache-to: type=gha,mode=max diff --git a/BUILD.md b/BUILD.md deleted file mode 100644 index f9a8989..0000000 --- a/BUILD.md +++ /dev/null @@ -1,9 +0,0 @@ -# Build (Docker) - -Also works with Podman. - -```sh -docker build -t 'chromedriver-http-proxy' . -docker run --rm -p "32323:32323" chromedriver-http-proxy -``` - diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index fed03b5..0000000 --- a/Dockerfile +++ /dev/null @@ -1,30 +0,0 @@ -FROM ghcr.io/astral-sh/uv:debian-slim - -ENV PROXY_PORT=32323 -ENV PROXY_HOST=0.0.0.0 -ENV SCRAPER_WAIT_TIME=10 -ENV SCRAPER_HEADLESS=True -ENV SCRAPER_USER_AGENT="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36" - -WORKDIR /app -COPY uv.lock pyproject.toml /app -RUN uv sync --locked - -RUN apt-get update && apt-get install -y \ - wget \ - gnupg \ - ca-certificates \ - && wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | apt-key add - \ - && echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list \ - && apt-get update \ - && apt-get install -y google-chrome-stable \ - && rm -rf /var/cache/apt/archives /var/lib/apt/lists/* - -COPY . /app - -CMD uv run proxy.py \ - --port="$PROXY_PORT"\ - --host="$PROXY_HOST"\ - --wait="$SCRAPER_WAIT_TIME" \ - --headless="$SCRAPER_HEADLESS" \ - --user-agent="$SCRAPER_USER_AGENT" diff --git a/README.md b/README.md index c9b9a24..d0d6c88 100644 --- a/README.md +++ b/README.md @@ -6,21 +6,11 @@ Simple HTTP proxy that renders pages with undetected-chromedriver and returns th - Solves [Anubis](https://anubis.techaro.lol/) - Solves [go-away](https://git.gammaspectra.live/git/go-away) -- Solves similiar [POW challenges](https://git.gammaspectra.live/git/go-away#other-similar-projects) +- Solves similiar POW challenges - Sometimes bypasses Cloudflare Turnstile ## Installation -### Container - -```sh -docker run --rm -p "32323:32323" ghcr.io/s4wyer/chromedriver-http-proxy # or s44wyer/chromedriver-http-proxy -``` - -There's also a [Docker compose example](/docker-compose.yml) with better config. - -### System - uv: ```sh @@ -62,7 +52,7 @@ This proxy has no authentication, and I don't plan to add any (PRs welcome thoug ## TODO -- [ ] ARM Docker images +- [ ] Docker image - [ ] Send JS/CSS to the client - [ ] Custom Chromium binary locations - [ ] More CLI arguments to control ChromeDriver behavior @@ -70,7 +60,6 @@ This proxy has no authentication, and I don't plan to add any (PRs welcome thoug - [ ] Screenshot endpoint - [ ] Allow custom headers - [ ] POST requests -- [x] Docker image ## Similiar Projects diff --git a/config.py b/config.py deleted file mode 100644 index 2ad821c..0000000 --- a/config.py +++ /dev/null @@ -1,72 +0,0 @@ -import os -import argparse -from dataclasses import dataclass - - -@dataclass -class ScraperConfig: - wait_time: float - headless: bool - user_agent: str - - -@dataclass -class ServerConfig: - host: str - port: int - - -def get_configs(): - parser = argparse.ArgumentParser(prog="ChromeDriver HTTP Proxy", - description="Simple HTTP proxy that renders pages with undetected-chromedriver and returns the HTML", - usage="") - parser.add_argument( - "--port", - help="Port the proxy runs on.", - required=False, - type=int, - default=os.getenv("PROXY_PORT", 32323) - ) - - parser.add_argument( - "--host", - help="Host the proxy to runs on.", - required=False, - type=str, - default=os.getenv("PROXY_HOST", "0.0.0.0") - ) - - parser.add_argument( - "--wait", - help="Seconds to wait before returning content.", - required=False, - type=float, - default=os.getenv("SCRAPER_WAIT_TIME", 10) - ) - - parser.add_argument( - "--headless", - help="Whether or not to run Chrome headless.", - required=False, - type=bool, - default=os.getenv("SCRAPER_HEADLESS", True) - ) - - parser.add_argument( - "--user-agent", - help="Chrome user agent. Changing with the current ChromeDriver version recommended.", - required=False, - type=str, - default=os.getenv("SCRAPER_USER_AGENT", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36") - ) - - args = parser.parse_args() - - server_config = ServerConfig(host=args.host, - port=args.port) - - scraper_config = ScraperConfig(wait_time=args.wait, - headless=args.headless, - user_agent=args.user_agent) - - return server_config, scraper_config diff --git a/docker-compose.yml b/docker-compose.yml deleted file mode 100644 index 49bdd48..0000000 --- a/docker-compose.yml +++ /dev/null @@ -1,20 +0,0 @@ -services: - chromedriver-http-proxy: - image: ghcr.io/s4wyer/chromedriver-http-proxy:latest - container_name: chromedriver-http-proxy - - ports: - - "32323:32323" - - environment: - - PROXY_PORT=32323 - - PROXY_HOST=0.0.0.0 - - SCRAPER_WAIT_TIME=10 - - SCRAPER_HEADLESS=True - - SCRAPER_USER_AGENT="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36" - - # Increase shared memory size to prevent Chromium from crashing. - # Recommended by Selenium https://hub.docker.com/r/selenium/standalone-chrome - shm_size: '2gb' - - restart: unless-stopped diff --git a/proxy.py b/proxy.py index f88a4f4..c86bc5f 100644 --- a/proxy.py +++ b/proxy.py @@ -1,8 +1,6 @@ import time -import atexit -import logging - -from config import get_configs, ScraperConfig +from dataclasses import dataclass +import argparse import undetected_chromedriver as uc from selenium.webdriver.support.ui import WebDriverWait @@ -11,57 +9,120 @@ from selenium.webdriver import ChromeOptions from flask import Flask, request +@dataclass +class ScraperConfig: + wait_time: float + headless: bool + user_agent: str + + +@dataclass +class ServerConfig: + host: str + port: int + + class Scraper: def __init__(self, config: ScraperConfig): self.config = config self.driver = None + + def __enter__(self): self._setup_driver() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self._cleanup() + + def _cleanup(self): + driver = self.driver + driver.close() + driver.quit() def _setup_driver(self): + headless = self.config.headless + user_agent = self.config.user_agent + chrome_options = ChromeOptions() - chrome_options.add_argument(f"--user-agent={self.config.user_agent}") + chrome_options.add_argument(f"--user-agent={user_agent}") self.driver = uc.Chrome( - headless=self.config.headless, + headless=headless, options=chrome_options, use_subprocess=False ) - logger.info("Driver started.") - - def cleanup(self): - if self.driver: - try: - self.driver.quit() - logger.info("Driver closed.") - except Exception as e: - logger.error(f"Exception during cleanup: {e}") - finally: - self.driver = None - def render_page(self, url): - logger.info(f"Fetching {url}...") - self.driver.get(url) + wait_time = self.config.wait_time + driver = self.driver - WebDriverWait(self.driver, timeout=self.config.wait_time).until( + driver.get(url) + + WebDriverWait(self.driver, wait_time).until( lambda driver: driver.execute_script("return document.readyState") == "complete" ) - time.sleep(self.config.wait_time) - - logger.info(f"Fetched {url}.") + time.sleep(wait_time) return self.driver.page_source if __name__ == "__main__": - # logging.basicConfig(level=logging.INFO) - logger = logging.getLogger(__name__) - server_config, scraper_config = get_configs() + parser = argparse.ArgumentParser(prog="ChromeDriver HTTP Proxy", + description="Simple HTTP proxy that renders pages with undetected-chromedriver and returns the HTML", + usage="") + parser.add_argument( + "--port", + help="Port the proxy runs on.", + required=False, + type=int, + default=32323 + ) - scraper = Scraper(scraper_config) + parser.add_argument( + "--host", + help="Host the proxy to runs on.", + required=False, + type=str, + default="0.0.0.0" + ) - atexit.register(scraper.cleanup) + parser.add_argument( + "--wait", + help="Seconds to wait before returning content.", + required=False, + type=float, + default=10 + ) + + parser.add_argument( + "--headless", + help="Whether or not to run Chrome headless.", + required=False, + type=bool, + default=True + ) + + parser.add_argument( + "--user-agent", + help="Chrome user agent. Changing with the current ChromeDriver version recommended.", + required=False, + type=str, + default="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36" + + ) + + args = parser.parse_args() + + port = args.port + host = args.host + + wait = args.wait + headless = args.headless + user_agent = args.user_agent + + server_config = ServerConfig(host=host, port=port) + scraper_config = ScraperConfig(wait_time=wait, headless=headless, user_agent=user_agent) # run the server app = Flask(__name__) @@ -69,12 +130,11 @@ if __name__ == "__main__": @app.route("/") def proxy_route(): url = request.args.get("url") - - try: - html = scraper.render_page(url) - return html - logger.info(f"Successfully sent {url} to client.") - except Exception as e: - logger.error(f"Error sending {url} to client: {e}", 500) + with Scraper(scraper_config) as scraper: + try: + html = scraper.render_page(url) + return html + except Exception as e: + print(f"Error: {e}") app.run(host=server_config.host, port=server_config.port) diff --git a/pyproject.toml b/pyproject.toml index 3e9a68b..324808e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,8 +3,6 @@ name = "playwright-http-proxy" version = "0.1.0" description = "Add your description here" readme = "README.md" -license = "AGPL-3.0-only" -license-files = ["LICEN[CS]E*"] requires-python = ">=3.13" dependencies = [ "argparse>=1.4.0",