initialize
This commit is contained in:
commit
c604f5e4de
6 changed files with 635 additions and 0 deletions
140
proxy.py
Normal file
140
proxy.py
Normal file
|
@ -0,0 +1,140 @@
|
|||
import time
|
||||
from dataclasses import dataclass
|
||||
import argparse
|
||||
|
||||
import undetected_chromedriver as uc
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver import ChromeOptions
|
||||
|
||||
from flask import Flask, request
|
||||
|
||||
|
||||
@dataclass
class ScraperConfig:
    """Settings that control how the browser renders a page.

    Attributes:
        wait_time: Seconds used both as the page-load timeout and as the
            extra pause before the page source is captured.
        headless: Passed to the Chrome driver to run without a visible window.
        user_agent: Value sent to Chrome through the ``--user-agent`` argument.
    """

    # Field order is part of the positional constructor signature.
    wait_time: float
    headless: bool
    user_agent: str
|
||||
|
||||
|
||||
@dataclass
class ServerConfig:
    """Network settings for the HTTP server that exposes the proxy.

    Attributes:
        host: Interface address the server binds to.
        port: TCP port the server listens on.
    """

    # Field order is part of the positional constructor signature.
    host: str
    port: int
|
||||
|
||||
|
||||
class Scraper:
    """Render web pages in a Chrome session driven by undetected-chromedriver.

    Use as a context manager: the browser is started on entry and shut down
    on exit, e.g. ``with Scraper(config) as scraper: scraper.render_page(url)``.
    """

    def __init__(self, config: "ScraperConfig"):
        """Store *config*; the browser itself is only started in ``__enter__``."""
        self.config = config
        self.driver = None

    def __enter__(self):
        """Start the browser and return this scraper."""
        self._setup_driver()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Shut the browser down; exceptions from the ``with`` body propagate."""
        self._cleanup()

    def _cleanup(self):
        """Close the browser window and end the driver session.

        Safe to call when no driver was ever created and safe to call twice.
        """
        # Detach the driver first so a second call (or a failure below)
        # never operates on a dead session.
        driver, self.driver = self.driver, None
        if driver is None:
            return

        try:
            driver.close()
        finally:
            # Always quit, even if closing the window failed; otherwise the
            # Chrome process would be left running.
            driver.quit()

    def _setup_driver(self):
        """Create the Chrome driver from the configured options."""
        chrome_options = ChromeOptions()
        chrome_options.add_argument(f"--user-agent={self.config.user_agent}")

        self.driver = uc.Chrome(
            headless=self.config.headless,
            options=chrome_options,
            use_subprocess=False,
        )

    def render_page(self, url):
        """Load *url* in the browser and return the rendered page source.

        Args:
            url: Address of the page to load.

        Returns:
            The page source reported by the driver after the wait period.

        Raises:
            ValueError: If *url* is empty or ``None``.
            RuntimeError: If the browser has not been started.
        """
        if not url:
            raise ValueError("url must be a non-empty string")
        if self.driver is None:
            raise RuntimeError("browser is not running; use Scraper as a context manager")

        wait_time = self.config.wait_time
        driver = self.driver

        driver.get(url)

        # Block until the document reports it has finished loading.
        WebDriverWait(driver, wait_time).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )

        # Extra fixed pause after the load event before capturing the source.
        time.sleep(wait_time)

        return driver.page_source
|
||||
|
||||
|
||||
def parse_bool_flag(value):
    """Convert a command-line string into a real boolean.

    ``type=bool`` treats every non-empty string (including ``"False"``) as
    ``True``; this parser interprets the text instead.

    Args:
        value: Raw argument text, or an actual ``bool``.

    Returns:
        The boolean the text stands for.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value

    normalized = str(value).strip().lower()
    if normalized in {"1", "true", "t", "yes", "y", "on"}:
        return True
    # The empty string stays falsy, as it was under ``type=bool``.
    if normalized in {"", "0", "false", "f", "no", "n", "off"}:
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="ChromeDriver HTTP Proxy",
        description="Simple HTTP proxy that renders pages with undetected-chromedriver and returns the HTML",
        usage="",
    )
    parser.add_argument(
        "--port",
        help="Port the proxy runs on.",
        required=False,
        type=int,
        default=32323,
    )
    parser.add_argument(
        "--host",
        help="Host the proxy runs on.",
        required=False,
        type=str,
        default="0.0.0.0",
    )
    parser.add_argument(
        "--wait",
        help="Seconds to wait before returning content.",
        required=False,
        type=float,
        default=10,
    )
    parser.add_argument(
        "--headless",
        help="Whether or not to run Chrome headless.",
        required=False,
        type=parse_bool_flag,
        default=True,
    )
    parser.add_argument(
        "--user-agent",
        help="Chrome user agent. Changing with the current ChromeDriver version recommended.",
        required=False,
        type=str,
        default="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36",
    )

    args = parser.parse_args()

    server_config = ServerConfig(host=args.host, port=args.port)
    scraper_config = ScraperConfig(
        wait_time=args.wait,
        headless=args.headless,
        user_agent=args.user_agent,
    )

    # run the server
    app = Flask(__name__)

    @app.route("/")
    def proxy_route():
        """Render the page named by the ``url`` query parameter and return its HTML."""
        url = request.args.get("url")
        if not url:
            return "Missing required query parameter: url", 400

        # The try wraps browser start-up as well, so a failure to launch
        # Chrome is reported the same way as a failure to render.
        try:
            with Scraper(scraper_config) as scraper:
                return scraper.render_page(url)
        except Exception as e:
            print(f"Error: {e}")
            # A view must return a response; details stay in the server log
            # and are not echoed back to the client.
            return "Failed to render the requested page", 502

    app.run(host=server_config.host, port=server_config.port)
|
Loading…
Add table
Add a link
Reference in a new issue