From 548080921213c2e8a6e195d6919d06e0e8514263 Mon Sep 17 00:00:00 2001 From: Sawyer Date: Wed, 23 Jul 2025 13:21:59 -0500 Subject: [PATCH 01/10] chore(license): add license to pyproject.toml --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 324808e..3e9a68b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,6 +3,8 @@ name = "playwright-http-proxy" version = "0.1.0" description = "Add your description here" readme = "README.md" +license = "AGPL-3.0-only" +license-files = ["LICEN[CS]E*"] requires-python = ">=3.13" dependencies = [ "argparse>=1.4.0", From ca91190291750d0293543bb9e0c7bd5f0da67e9a Mon Sep 17 00:00:00 2001 From: Sawyer Date: Fri, 25 Jul 2025 00:35:20 -0500 Subject: [PATCH 02/10] style: use self.config directly instead of assigning a variable --- proxy.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/proxy.py b/proxy.py index c86bc5f..18c87fe 100644 --- a/proxy.py +++ b/proxy.py @@ -40,14 +40,11 @@ class Scraper: driver.quit() def _setup_driver(self): - headless = self.config.headless - user_agent = self.config.user_agent - chrome_options = ChromeOptions() - chrome_options.add_argument(f"--user-agent={user_agent}") + chrome_options.add_argument(f"--user-agent={self.config.user_agent}") self.driver = uc.Chrome( - headless=headless, + headless=self.config.headless, options=chrome_options, use_subprocess=False ) From 0dfec4d269f163e0028e2826da1bc07b38b37c33 Mon Sep 17 00:00:00 2001 From: Sawyer Date: Fri, 25 Jul 2025 01:38:09 -0500 Subject: [PATCH 03/10] perf: keep the same instance of chromedriver open rather than open a new one - remove context manager implementation - register a cleanup handler - send a 500 error if selenium throws an exception --- proxy.py | 53 +++++++++++++++++++++++------------------------------ 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/proxy.py b/proxy.py index 18c87fe..0305e5a 100644 --- a/proxy.py +++ b/proxy.py @@ -1,6 
+1,7 @@ import time from dataclasses import dataclass import argparse +import atexit import undetected_chromedriver as uc from selenium.webdriver.support.ui import WebDriverWait @@ -26,18 +27,7 @@ class Scraper: def __init__(self, config: ScraperConfig): self.config = config self.driver = None - - def __enter__(self): self._setup_driver() - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self._cleanup() - - def _cleanup(self): - driver = self.driver - driver.close() - driver.quit() def _setup_driver(self): chrome_options = ChromeOptions() @@ -49,17 +39,23 @@ class Scraper: use_subprocess=False ) + def cleanup(self): + if self.driver: + try: + self.driver.quit() + except Exception as e: + print(f"Error during cleanup: {e}") + finally: + self.driver = None + def render_page(self, url): - wait_time = self.config.wait_time - driver = self.driver + self.driver.get(url) - driver.get(url) - - WebDriverWait(self.driver, wait_time).until( + WebDriverWait(self.driver, timeout=self.config.wait_time).until( lambda driver: driver.execute_script("return document.readyState") == "complete" ) - time.sleep(wait_time) + time.sleep(self.config.wait_time) return self.driver.page_source @@ -111,15 +107,12 @@ if __name__ == "__main__": args = parser.parse_args() - port = args.port - host = args.host + server_config = ServerConfig(host=args.host, port=args.port) + scraper_config = ScraperConfig(wait_time=args.wait, headless=args.headless, user_agent=args.user_agent) - wait = args.wait - headless = args.headless - user_agent = args.user_agent + scraper = Scraper(scraper_config) - server_config = ServerConfig(host=host, port=port) - scraper_config = ScraperConfig(wait_time=wait, headless=headless, user_agent=user_agent) + atexit.register(scraper.cleanup) # run the server app = Flask(__name__) @@ -127,11 +120,11 @@ if __name__ == "__main__": @app.route("/") def proxy_route(): url = request.args.get("url") - with Scraper(scraper_config) as scraper: - try: - html = 
scraper.render_page(url) - return html - except Exception as e: - print(f"Error: {e}") + + try: + html = scraper.render_page(url) + return html + except Exception as e: + print(f"Error: {e}", 500) app.run(host=server_config.host, port=server_config.port) From 36a0c0fe60ae481f05e5e88ee39353ab4a889193 Mon Sep 17 00:00:00 2001 From: Sawyer Date: Fri, 25 Jul 2025 01:49:58 -0500 Subject: [PATCH 04/10] refactor: move config to a separate file --- config.py | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ proxy.py | 66 +++------------------------------------------------ 2 files changed, 74 insertions(+), 63 deletions(-) create mode 100644 config.py diff --git a/config.py b/config.py new file mode 100644 index 0000000..ea28e5e --- /dev/null +++ b/config.py @@ -0,0 +1,71 @@ +import argparse +from dataclasses import dataclass + + +@dataclass +class ScraperConfig: + wait_time: float + headless: bool + user_agent: str + + +@dataclass +class ServerConfig: + host: str + port: int + + +def get_configs(): + parser = argparse.ArgumentParser(prog="ChromeDriver HTTP Proxy", + description="Simple HTTP proxy that renders pages with undetected-chromedriver and returns the HTML", + usage="") + parser.add_argument( + "--port", + help="Port the proxy runs on.", + required=False, + type=int, + default=32323 + ) + + parser.add_argument( + "--host", + help="Host the proxy to runs on.", + required=False, + type=str, + default="0.0.0.0" + ) + + parser.add_argument( + "--wait", + help="Seconds to wait before returning content.", + required=False, + type=float, + default=10 + ) + + parser.add_argument( + "--headless", + help="Whether or not to run Chrome headless.", + required=False, + type=bool, + default=True + ) + + parser.add_argument( + "--user-agent", + help="Chrome user agent. 
Changing with the current ChromeDriver version recommended.", + required=False, + type=str, + default="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36" + ) + + args = parser.parse_args() + + server_config = ServerConfig(host=args.host, + port=args.port) + + scraper_config = ScraperConfig(wait_time=args.wait, + headless=args.headless, + user_agent=args.user_agent) + + return server_config, scraper_config diff --git a/proxy.py b/proxy.py index 0305e5a..f3b14a6 100644 --- a/proxy.py +++ b/proxy.py @@ -1,8 +1,8 @@ import time -from dataclasses import dataclass -import argparse import atexit +from config import get_configs, ScraperConfig + import undetected_chromedriver as uc from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver import ChromeOptions @@ -10,19 +10,6 @@ from selenium.webdriver import ChromeOptions from flask import Flask, request -@dataclass -class ScraperConfig: - wait_time: float - headless: bool - user_agent: str - - -@dataclass -class ServerConfig: - host: str - port: int - - class Scraper: def __init__(self, config: ScraperConfig): self.config = config @@ -61,54 +48,7 @@ class Scraper: if __name__ == "__main__": - parser = argparse.ArgumentParser(prog="ChromeDriver HTTP Proxy", - description="Simple HTTP proxy that renders pages with undetected-chromedriver and returns the HTML", - usage="") - parser.add_argument( - "--port", - help="Port the proxy runs on.", - required=False, - type=int, - default=32323 - ) - - parser.add_argument( - "--host", - help="Host the proxy to runs on.", - required=False, - type=str, - default="0.0.0.0" - ) - - parser.add_argument( - "--wait", - help="Seconds to wait before returning content.", - required=False, - type=float, - default=10 - ) - - parser.add_argument( - "--headless", - help="Whether or not to run Chrome headless.", - required=False, - type=bool, - default=True - ) - - parser.add_argument( - "--user-agent", - help="Chrome user 
agent. Changing with the current ChromeDriver version recommended.", - required=False, - type=str, - default="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36" - - ) - - args = parser.parse_args() - - server_config = ServerConfig(host=args.host, port=args.port) - scraper_config = ScraperConfig(wait_time=args.wait, headless=args.headless, user_agent=args.user_agent) + server_config, scraper_config = get_configs() scraper = Scraper(scraper_config) From 997ab17dd8470f7a152faad36aff0d0689f3ad9c Mon Sep 17 00:00:00 2001 From: Sawyer Date: Fri, 25 Jul 2025 02:03:38 -0500 Subject: [PATCH 05/10] feat: replace print statements with logging --- proxy.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/proxy.py b/proxy.py index f3b14a6..f88a4f4 100644 --- a/proxy.py +++ b/proxy.py @@ -1,5 +1,6 @@ import time import atexit +import logging from config import get_configs, ScraperConfig @@ -26,16 +27,20 @@ class Scraper: use_subprocess=False ) + logger.info("Driver started.") + def cleanup(self): if self.driver: try: self.driver.quit() + logger.info("Driver closed.") except Exception as e: - print(f"Error during cleanup: {e}") + logger.error(f"Exception during cleanup: {e}") finally: self.driver = None def render_page(self, url): + logger.info(f"Fetching {url}...") self.driver.get(url) WebDriverWait(self.driver, timeout=self.config.wait_time).until( @@ -44,10 +49,14 @@ class Scraper: time.sleep(self.config.wait_time) + logger.info(f"Fetched {url}.") + return self.driver.page_source if __name__ == "__main__": + # logging.basicConfig(level=logging.INFO) + logger = logging.getLogger(__name__) server_config, scraper_config = get_configs() scraper = Scraper(scraper_config) @@ -64,7 +73,8 @@ if __name__ == "__main__": try: html = scraper.render_page(url) return html + logger.info(f"Successfully sent {url} to client.") except Exception as e: - print(f"Error: {e}", 500) + logger.error(f"Error sending 
{url} to client: {e}", 500) app.run(host=server_config.host, port=server_config.port) From 244aea41cc2531e7c6c49a1ab6c26cb4c464f732 Mon Sep 17 00:00:00 2001 From: Sawyer Date: Tue, 5 Aug 2025 18:47:06 -0500 Subject: [PATCH 06/10] feat(config): allow configuration with environment variables --- config.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/config.py b/config.py index ea28e5e..2ad821c 100644 --- a/config.py +++ b/config.py @@ -1,3 +1,4 @@ +import os import argparse from dataclasses import dataclass @@ -24,7 +25,7 @@ def get_configs(): help="Port the proxy runs on.", required=False, type=int, - default=32323 + default=os.getenv("PROXY_PORT", 32323) ) parser.add_argument( @@ -32,7 +33,7 @@ def get_configs(): help="Host the proxy to runs on.", required=False, type=str, - default="0.0.0.0" + default=os.getenv("PROXY_HOST", "0.0.0.0") ) parser.add_argument( @@ -40,7 +41,7 @@ def get_configs(): help="Seconds to wait before returning content.", required=False, type=float, - default=10 + default=os.getenv("SCRAPER_WAIT_TIME", 10) ) parser.add_argument( @@ -48,7 +49,7 @@ def get_configs(): help="Whether or not to run Chrome headless.", required=False, type=bool, - default=True + default=os.getenv("SCRAPER_HEADLESS", True) ) parser.add_argument( @@ -56,7 +57,7 @@ def get_configs(): help="Chrome user agent. 
Changing with the current ChromeDriver version recommended.", required=False, type=str, - default="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36" + default=os.getenv("SCRAPER_USER_AGENT", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36") ) args = parser.parse_args() From 271897f18f81cc49c071a92a240aad4963e03523 Mon Sep 17 00:00:00 2001 From: Sawyer Date: Tue, 5 Aug 2025 19:27:37 -0500 Subject: [PATCH 07/10] feat(container): add container builds --- .dockerignore | 12 ++++++ .github/workflows/build-and-publish.yml | 57 +++++++++++++++++++++++++ BUILD.md | 9 ++++ Dockerfile | 30 +++++++++++++ README.md | 3 +- 5 files changed, 110 insertions(+), 1 deletion(-) create mode 100644 .dockerignore create mode 100644 .github/workflows/build-and-publish.yml create mode 100644 BUILD.md create mode 100644 Dockerfile diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..d0e2168 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,12 @@ +# Python-generated files +__pycache__/ +*.py[oc] +build/ +dist/ +wheels/ +*.egg-info + +# Virtual environments +.venv + +.git diff --git a/.github/workflows/build-and-publish.yml b/.github/workflows/build-and-publish.yml new file mode 100644 index 0000000..8502c1d --- /dev/null +++ b/.github/workflows/build-and-publish.yml @@ -0,0 +1,57 @@ +name: Build and Push Docker Image + +on: + push: + branches: [ "main", "master" ] + +jobs: + build: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Log in to GHCR + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor 
}} + password: ${{ secrets.GHCR_TOKEN }} + + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@v5 + with: + images: | + ${{ secrets.DOCKERHUB_USERNAME }}/${{ github.event.repository.name }} + ghcr.io/${{ github.repository }} + tags: | + type=ref,event=branch + type=semver,pattern={{version}} + type=semver,pattern={{major}}.{{minor}} + type=raw,value=latest,enable={{is_default_branch}} + + - name: Build and push Docker image + uses: docker/build-push-action@v5 + with: + context: . + file: ./Dockerfile + platforms: linux/amd64 + push: ${{ github.event_name }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max diff --git a/BUILD.md b/BUILD.md new file mode 100644 index 0000000..f9a8989 --- /dev/null +++ b/BUILD.md @@ -0,0 +1,9 @@ +# Build (Docker) + +Also works with Podman. + +```sh +docker build -t 'chromedriver-http-proxy' . +docker run --rm -p "32323:32323" chromedriver-http-proxy +``` + diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..fed03b5 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,30 @@ +FROM ghcr.io/astral-sh/uv:debian-slim + +ENV PROXY_PORT=32323 +ENV PROXY_HOST=0.0.0.0 +ENV SCRAPER_WAIT_TIME=10 +ENV SCRAPER_HEADLESS=True +ENV SCRAPER_USER_AGENT="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36" + +WORKDIR /app +COPY uv.lock pyproject.toml /app +RUN uv sync --locked + +RUN apt-get update && apt-get install -y \ + wget \ + gnupg \ + ca-certificates \ + && wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | apt-key add - \ + && echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list \ + && apt-get update \ + && apt-get install -y google-chrome-stable \ + && rm -rf /var/cache/apt/archives /var/lib/apt/lists/* + +COPY . 
/app + +CMD uv run proxy.py \ + --port="$PROXY_PORT"\ + --host="$PROXY_HOST"\ + --wait="$SCRAPER_WAIT_TIME" \ + --headless="$SCRAPER_HEADLESS" \ + --user-agent="$SCRAPER_USER_AGENT" diff --git a/README.md b/README.md index d0d6c88..92de96b 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ This proxy has no authentication, and I don't plan to add any (PRs welcome thoug ## TODO -- [ ] Docker image +- [ ] ARM Docker images - [ ] Send JS/CSS to the client - [ ] Custom Chromium binary locations - [ ] More CLI arguments to control ChromeDriver behavior @@ -60,6 +60,7 @@ This proxy has no authentication, and I don't plan to add any (PRs welcome thoug - [ ] Screenshot endpoint - [ ] Allow custom headers - [ ] POST requests +- [x] Docker image ## Similiar Projects From c4eba52ba841d0ab895ff48545bb6b82022bb803 Mon Sep 17 00:00:00 2001 From: Sawyer Date: Tue, 5 Aug 2025 19:31:54 -0500 Subject: [PATCH 08/10] fix(ci): fix failing build --- .github/workflows/build-and-publish.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-and-publish.yml b/.github/workflows/build-and-publish.yml index 8502c1d..7ed9787 100644 --- a/.github/workflows/build-and-publish.yml +++ b/.github/workflows/build-and-publish.yml @@ -29,7 +29,7 @@ jobs: with: registry: ghcr.io username: ${{ github.actor }} - password: ${{ secrets.GHCR_TOKEN }} + password: ${{ secrets.GITHUB_TOKEN }} - name: Extract metadata (tags, labels) for Docker id: meta @@ -50,7 +50,7 @@ jobs: context: . 
file: ./Dockerfile platforms: linux/amd64 - push: ${{ github.event_name }} + push: true tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} cache-from: type=gha From 6698ac016cb58e2caa60d0804ca83f637e0e61de Mon Sep 17 00:00:00 2001 From: Sawyer Date: Tue, 5 Aug 2025 19:42:52 -0500 Subject: [PATCH 09/10] feat(container): add docker compose and docker docs --- README.md | 10 ++++++++++ docker-compose.yml | 20 ++++++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 docker-compose.yml diff --git a/README.md b/README.md index 92de96b..e61f726 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,16 @@ Simple HTTP proxy that renders pages with undetected-chromedriver and returns th ## Installation +### Container + +```sh +docker run --rm -p "32323:32323" ghcr.io/s4wyer/chromedriver-http-proxy # or s44wyer/chromedriver-http-proxy +``` + +There's also a [Docker compose example](/docker-compose.yml) with better config. + +### System + uv: ```sh diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..49bdd48 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,20 @@ +services: + chromedriver-http-proxy: + image: ghcr.io/s4wyer/chromedriver-http-proxy:latest + container_name: chromedriver-http-proxy + + ports: + - "32323:32323" + + environment: + - PROXY_PORT=32323 + - PROXY_HOST=0.0.0.0 + - SCRAPER_WAIT_TIME=10 + - SCRAPER_HEADLESS=True + - SCRAPER_USER_AGENT="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36" + + # Increase shared memory size to prevent Chromium from crashing. 
+ # Recommended by Selenium https://hub.docker.com/r/selenium/standalone-chrome + shm_size: '2gb' + + restart: unless-stopped From 8efbbbcf0812c2b9dee61e7b0071687be1646527 Mon Sep 17 00:00:00 2001 From: Sawyer Date: Sun, 17 Aug 2025 14:11:31 -0500 Subject: [PATCH 10/10] docs(README): add a link to the other POW challenges --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e61f726..c9b9a24 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ Simple HTTP proxy that renders pages with undetected-chromedriver and returns th - Solves [Anubis](https://anubis.techaro.lol/) - Solves [go-away](https://git.gammaspectra.live/git/go-away) -- Solves similiar POW challenges +- Solves similar [POW challenges](https://git.gammaspectra.live/git/go-away#other-similar-projects) - Sometimes bypasses Cloudflare Turnstile ## Installation