Compare commits
No commits in common. "8efbbbcf0812c2b9dee61e7b0071687be1646527" and "52f5392ce07236dcd483bbf474c24f2f89e0cd54" have entirely different histories.
8efbbbcf08...52f5392ce0
9 changed files with 98 additions and 251 deletions
```diff
@@ -1,12 +0,0 @@
-# Python-generated files
-__pycache__/
-*.py[oc]
-build/
-dist/
-wheels/
-*.egg-info
-
-# Virtual environments
-.venv
-
-.git
```
.github/workflows/build-and-publish.yml (57 changes, vendored)
```diff
@@ -1,57 +0,0 @@
-name: Build and Push Docker Image
-
-on:
-  push:
-    branches: [ "main", "master" ]
-
-jobs:
-  build:
-    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      packages: write
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
-      - name: Log in to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-
-      - name: Log in to GHCR
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Extract metadata (tags, labels) for Docker
-        id: meta
-        uses: docker/metadata-action@v5
-        with:
-          images: |
-            ${{ secrets.DOCKERHUB_USERNAME }}/${{ github.event.repository.name }}
-            ghcr.io/${{ github.repository }}
-          tags: |
-            type=ref,event=branch
-            type=semver,pattern={{version}}
-            type=semver,pattern={{major}}.{{minor}}
-            type=raw,value=latest,enable={{is_default_branch}}
-
-      - name: Build and push Docker image
-        uses: docker/build-push-action@v5
-        with:
-          context: .
-          file: ./Dockerfile
-          platforms: linux/amd64
-          push: true
-          tags: ${{ steps.meta.outputs.tags }}
-          labels: ${{ steps.meta.outputs.labels }}
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
```
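Assuming this workflow has run on the default branch, the images it publishes should match the names the README uses; pulling them is a quick way to verify (tag assumed from the `latest` rule above):

```sh
# Pull the image this workflow publishes (names from the README, tag assumed)
docker pull ghcr.io/s4wyer/chromedriver-http-proxy:latest
docker pull s44wyer/chromedriver-http-proxy:latest
```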
BUILD.md (9 changes)
```diff
@@ -1,9 +0,0 @@
-# Build (Docker)
-
-Also works with Podman.
-
-```sh
-docker build -t 'chromedriver-http-proxy' .
-docker run --rm -p "32323:32323" chromedriver-http-proxy
-```
-
```
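Since the deleted BUILD.md notes Podman compatibility, the same commands should work with the CLI swapped; a minimal sketch:

```sh
# Same build and run with Podman (CLI-compatible with the docker commands above)
podman build -t 'chromedriver-http-proxy' .
podman run --rm -p "32323:32323" chromedriver-http-proxy
```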
Dockerfile (30 changes)
```diff
@@ -1,30 +0,0 @@
-FROM ghcr.io/astral-sh/uv:debian-slim
-
-ENV PROXY_PORT=32323
-ENV PROXY_HOST=0.0.0.0
-ENV SCRAPER_WAIT_TIME=10
-ENV SCRAPER_HEADLESS=True
-ENV SCRAPER_USER_AGENT="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"
-
-WORKDIR /app
-COPY uv.lock pyproject.toml /app
-RUN uv sync --locked
-
-RUN apt-get update && apt-get install -y \
-    wget \
-    gnupg \
-    ca-certificates \
-    && wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | apt-key add - \
-    && echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list \
-    && apt-get update \
-    && apt-get install -y google-chrome-stable \
-    && rm -rf /var/cache/apt/archives /var/lib/apt/lists/*
-
-COPY . /app
-
-CMD uv run proxy.py \
-    --port="$PROXY_PORT" \
-    --host="$PROXY_HOST" \
-    --wait="$SCRAPER_WAIT_TIME" \
-    --headless="$SCRAPER_HEADLESS" \
-    --user-agent="$SCRAPER_USER_AGENT"
```
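Because the CMD reads its flags from the ENV defaults above, settings can be overridden per container without rebuilding; a minimal sketch with illustrative values:

```sh
# Override the baked-in defaults at run time (values are illustrative)
docker run --rm -p "8080:8080" \
  -e PROXY_PORT=8080 \
  -e SCRAPER_WAIT_TIME=5 \
  chromedriver-http-proxy
```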
README.md (15 changes)
````diff
@@ -6,21 +6,11 @@ Simple HTTP proxy that renders pages with undetected-chromedriver and returns the HTML
 - Solves [Anubis](https://anubis.techaro.lol/)
 - Solves [go-away](https://git.gammaspectra.live/git/go-away)
-- Solves similiar [POW challenges](https://git.gammaspectra.live/git/go-away#other-similar-projects)
+- Solves similiar POW challenges
 - Sometimes bypasses Cloudflare Turnstile
 
 ## Installation
 
-### Container
-
-```sh
-docker run --rm -p "32323:32323" ghcr.io/s4wyer/chromedriver-http-proxy # or s44wyer/chromedriver-http-proxy
-```
-
-There's also a [Docker compose example](/docker-compose.yml) with better config.
-
 ### System
 
 uv:
 
 ```sh
````
```diff
@@ -62,7 +52,7 @@ This proxy has no authentication, and I don't plan to add any (PRs welcome though
 ## TODO
 
-- [ ] ARM Docker images
+- [ ] Docker image
 - [ ] Send JS/CSS to the client
 - [ ] Custom Chromium binary locations
 - [ ] More CLI arguments to control ChromeDriver behavior
```
```diff
@@ -70,7 +60,6 @@ This proxy has no authentication, and I don't plan to add any (PRs welcome though
 - [ ] Screenshot endpoint
 - [ ] Allow custom headers
 - [ ] POST requests
-- [x] Docker image
 
 ## Similiar Projects
 
```
config.py (72 changes)
```diff
@@ -1,72 +0,0 @@
-import os
-import argparse
-from dataclasses import dataclass
-
-
-@dataclass
-class ScraperConfig:
-    wait_time: float
-    headless: bool
-    user_agent: str
-
-
-@dataclass
-class ServerConfig:
-    host: str
-    port: int
-
-
-def get_configs():
-    parser = argparse.ArgumentParser(prog="ChromeDriver HTTP Proxy",
-                                     description="Simple HTTP proxy that renders pages with undetected-chromedriver and returns the HTML",
-                                     usage="")
-    parser.add_argument(
-        "--port",
-        help="Port the proxy runs on.",
-        required=False,
-        type=int,
-        default=os.getenv("PROXY_PORT", 32323)
-    )
-
-    parser.add_argument(
-        "--host",
-        help="Host the proxy to runs on.",
-        required=False,
-        type=str,
-        default=os.getenv("PROXY_HOST", "0.0.0.0")
-    )
-
-    parser.add_argument(
-        "--wait",
-        help="Seconds to wait before returning content.",
-        required=False,
-        type=float,
-        default=os.getenv("SCRAPER_WAIT_TIME", 10)
-    )
-
-    parser.add_argument(
-        "--headless",
-        help="Whether or not to run Chrome headless.",
-        required=False,
-        type=bool,
-        default=os.getenv("SCRAPER_HEADLESS", True)
-    )
-
-    parser.add_argument(
-        "--user-agent",
-        help="Chrome user agent. Changing with the current ChromeDriver version recommended.",
-        required=False,
-        type=str,
-        default=os.getenv("SCRAPER_USER_AGENT", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36")
-    )
-
-    args = parser.parse_args()
-
-    server_config = ServerConfig(host=args.host,
-                                 port=args.port)
-
-    scraper_config = ScraperConfig(wait_time=args.wait,
-                                   headless=args.headless,
-                                   user_agent=args.user_agent)
-
-    return server_config, scraper_config
```
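In the deleted get_configs(), each setting can come from a CLI flag or an environment variable, with the flag winning when both are set; illustrative usage:

```sh
# Two ways to set the wait time: environment variable or CLI flag
# (the flag wins; note argparse does not type-convert env-supplied
# defaults, so those arrive as strings)
SCRAPER_WAIT_TIME=5 uv run proxy.py
uv run proxy.py --wait 5
```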
docker-compose.yml

```diff
@@ -1,20 +0,0 @@
-services:
-  chromedriver-http-proxy:
-    image: ghcr.io/s4wyer/chromedriver-http-proxy:latest
-    container_name: chromedriver-http-proxy
-
-    ports:
-      - "32323:32323"
-
-    environment:
-      - PROXY_PORT=32323
-      - PROXY_HOST=0.0.0.0
-      - SCRAPER_WAIT_TIME=10
-      - SCRAPER_HEADLESS=True
-      - SCRAPER_USER_AGENT="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"
-
-    # Increase shared memory size to prevent Chromium from crashing.
-    # Recommended by Selenium https://hub.docker.com/r/selenium/standalone-chrome
-    shm_size: '2gb'
-
-    restart: unless-stopped
```
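For reference, the usual Compose workflow applies, assuming the file above is saved as docker-compose.yml:

```sh
# Start in the background and follow the proxy's logs
docker compose up -d
docker compose logs -f chromedriver-http-proxy
```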
proxy.py (132 changes)
```diff
@@ -1,8 +1,6 @@
 import time
-import atexit
 import logging
 
-from config import get_configs, ScraperConfig
+from dataclasses import dataclass
+import argparse
 
 import undetected_chromedriver as uc
 from selenium.webdriver.support.ui import WebDriverWait
```
```diff
@@ -11,57 +9,120 @@ from selenium.webdriver import ChromeOptions
 from flask import Flask, request
 
 
+@dataclass
+class ScraperConfig:
+    wait_time: float
+    headless: bool
+    user_agent: str
+
+
+@dataclass
+class ServerConfig:
+    host: str
+    port: int
+
+
 class Scraper:
     def __init__(self, config: ScraperConfig):
         self.config = config
         self.driver = None
 
+    def __enter__(self):
+        self._setup_driver()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self._cleanup()
+
+    def _cleanup(self):
+        driver = self.driver
+        driver.close()
+        driver.quit()
+
     def _setup_driver(self):
+        headless = self.config.headless
+        user_agent = self.config.user_agent
+
         chrome_options = ChromeOptions()
-        chrome_options.add_argument(f"--user-agent={self.config.user_agent}")
+        chrome_options.add_argument(f"--user-agent={user_agent}")
 
         self.driver = uc.Chrome(
-            headless=self.config.headless,
+            headless=headless,
             options=chrome_options,
             use_subprocess=False
         )
 
         logger.info("Driver started.")
 
-    def cleanup(self):
-        if self.driver:
-            try:
-                self.driver.quit()
-                logger.info("Driver closed.")
-            except Exception as e:
-                logger.error(f"Exception during cleanup: {e}")
-            finally:
-                self.driver = None
-
     def render_page(self, url):
-        logger.info(f"Fetching {url}...")
-        self.driver.get(url)
+        wait_time = self.config.wait_time
+        driver = self.driver
 
-        WebDriverWait(self.driver, timeout=self.config.wait_time).until(
+        driver.get(url)
+
+        WebDriverWait(self.driver, wait_time).until(
             lambda driver: driver.execute_script("return document.readyState") == "complete"
         )
 
-        time.sleep(self.config.wait_time)
-
-        logger.info(f"Fetched {url}.")
+        time.sleep(wait_time)
 
         return self.driver.page_source
 
 
 if __name__ == "__main__":
     # logging.basicConfig(level=logging.INFO)
     logger = logging.getLogger(__name__)
-    server_config, scraper_config = get_configs()
+    parser = argparse.ArgumentParser(prog="ChromeDriver HTTP Proxy",
+                                     description="Simple HTTP proxy that renders pages with undetected-chromedriver and returns the HTML",
+                                     usage="")
+    parser.add_argument(
+        "--port",
+        help="Port the proxy runs on.",
+        required=False,
+        type=int,
+        default=32323
+    )
 
-    scraper = Scraper(scraper_config)
+    parser.add_argument(
+        "--host",
+        help="Host the proxy to runs on.",
+        required=False,
+        type=str,
+        default="0.0.0.0"
+    )
 
-    atexit.register(scraper.cleanup)
+    parser.add_argument(
+        "--wait",
+        help="Seconds to wait before returning content.",
+        required=False,
+        type=float,
+        default=10
+    )
+
+    parser.add_argument(
+        "--headless",
+        help="Whether or not to run Chrome headless.",
+        required=False,
+        type=bool,
+        default=True
+    )
+
+    parser.add_argument(
+        "--user-agent",
+        help="Chrome user agent. Changing with the current ChromeDriver version recommended.",
+        required=False,
+        type=str,
+        default="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"
+    )
 
+    args = parser.parse_args()
+
+    port = args.port
+    host = args.host
+
+    wait = args.wait
+    headless = args.headless
+    user_agent = args.user_agent
+
+    server_config = ServerConfig(host=host, port=port)
+    scraper_config = ScraperConfig(wait_time=wait, headless=headless, user_agent=user_agent)
+
     # run the server
     app = Flask(__name__)
```
```diff
@@ -69,12 +130,11 @@ if __name__ == "__main__":
     @app.route("/")
     def proxy_route():
         url = request.args.get("url")
 
-        try:
-            html = scraper.render_page(url)
-            return html
-            logger.info(f"Successfully sent {url} to client.")
-        except Exception as e:
-            logger.error(f"Error sending {url} to client: {e}", 500)
+        with Scraper(scraper_config) as scraper:
+            try:
+                html = scraper.render_page(url)
+                return html
+            except Exception as e:
+                print(f"Error: {e}")
 
     app.run(host=server_config.host, port=server_config.port)
```
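With the route reading its target from the url query parameter, a quick smoke test against the default port looks like this (target URL is illustrative):

```sh
# Request a rendered page through the proxy (default port 32323)
curl "http://localhost:32323/?url=https://example.com"
```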
pyproject.toml

```diff
@@ -3,8 +3,6 @@ name = "playwright-http-proxy"
 version = "0.1.0"
 description = "Add your description here"
 readme = "README.md"
-license = "AGPL-3.0-only"
-license-files = ["LICEN[CS]E*"]
 requires-python = ">=3.13"
 dependencies = [
     "argparse>=1.4.0",
```