先说遇到的问题:使用selenium 的chrome driver 运行海外业务,毫无疑问的是需要海外代理的。开始的时候服务提供商提供的代理是不需要用户名和密码的。业务受限,寻找新的代理提供商的时候,遇到了一个需要使用用户名和密码的。于是乎就遇到了这样的问题,如何自动化输入用户名和密码?
原来项目使用代理的方式如下:
def getCapabilities(proxy_ip):
    """Build Chrome desired capabilities that route traffic through a proxy.

    Args:
        proxy_ip: Proxy address in ``host:port`` form; it is used for both
            the HTTP proxy and the SSL proxy.

    Returns:
        A capabilities dict for Chrome with the manual proxy settings added.
    """
    proxy = Proxy({
        'proxyType': ProxyType.MANUAL,
        'httpProxy': proxy_ip,
        'sslProxy': proxy_ip,
        'noProxy': ''})
    # Work on a copy: DesiredCapabilities.CHROME is a shared class-level dict,
    # and mutating it would leak these proxy settings into every later caller.
    capabilities = webdriver.DesiredCapabilities.CHROME.copy()
    proxy.add_to_capabilities(capabilities)
    return capabilities
proxy_ip = "127.0.0.1:2222"  # replace with your own proxy IP and port
chrome_option = webdriver.ChromeOptions()
service = Service(ChromeDriverManager().install())
# The proxy is injected through desired capabilities; `...` stands for the
# actual browsing logic and must be indented under the `with` statement.
with webdriver.Chrome(service=service, options=chrome_option, desired_capabilities=getCapabilities(proxy_ip)) as browser:
    ...
起初想按照url的格式“{username}:{password}@{proxy_ip}:{proxy_port}”进行替换,结果发现不行。
检索资料遇到了这种使用方法:
# Point Chrome at the proxy with the --proxy-server command-line switch.
# Only host and port are given here; no credentials are passed.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--proxy-server=proxy_host:port')
这种方式运行后,页面会弹窗提示输入用户名和密码。填写代理的用户名和密码后,程序运行正常(此时非headless模式)。那么,如何自动处理代理需要输入用户名和密码的情况呢?于是找到了下面这个解决办法:
# Connection details of the authenticated proxy -- fill in real values.
PROXY_HOST = ""
PROXY_PORT = 8080  # port (placeholder; replace with the real proxy port)
PROXY_USER = " " # proxy user name
PROXY_PASS = "" # proxy password

# Manifest of a small helper extension. "<all_urls>" is required so that the
# webRequest listener below is allowed to see requests to every site.
manifest_json = """
{
"version": "1.0.0",
"manifest_version": 2,
"name": "Chrome Proxy",
"permissions": [
"proxy",
"tabs",
"unlimitedStorage",
"storage",
"<all_urls>",
"webRequest",
"webRequestBlocking"
],
"background": {
"scripts": ["background.js"]
},
"minimum_chrome_version":"22.0.0"
}
"""

# Background script: installs a fixed proxy and answers the proxy's
# authentication challenge with the configured user name and password.
# The four %s placeholders are filled with host, port, user and password.
background_js = """
var config = {
mode: "fixed_servers",
rules: {
singleProxy: {
scheme: "http",
host: "%s",
port: parseInt(%s)
},
bypassList: ["localhost"]
}
};
chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});
function callbackFn(details) {
return {
authCredentials: {
username: "%s",
password: "%s"
}
};
}
chrome.webRequest.onAuthRequired.addListener(
callbackFn,
{urls: ["<all_urls>"]},
['blocking']
);
""" % (PROXY_HOST, PROXY_PORT, PROXY_USER, PROXY_PASS)

# Pack both files into a zip archive that Chrome can load as an extension.
pluginfile = 'proxy_auth_plugin.zip'
with zipfile.ZipFile(pluginfile, 'w') as zp:
    zp.writestr("manifest.json", manifest_json)
    zp.writestr("background.js", background_js)
# Load the generated proxy-auth extension archive into the Chrome options.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_extension(pluginfile)
这个的解决思路是使用chrome扩展来解决的。关于proxy 扩展的参考路径 代理扩展开发参考
这个方法上线后,发现一直报异常。对应的异常信息如下:
Traceback (most recent call last):
File "test.py", line 173, in <module>
with webdriver.Chrome(service=service, options=options ) as browser:
File "/usr/local/lib/python3.9/site-packages/seleniumwire/webdriver.py", line 218, in __init__
super().__init__(*args, **kwargs)
File "/usr/local/lib/python3.9/site-packages/selenium/webdriver/chrome/webdriver.py", line 69, in __init__
super().__init__(DesiredCapabilities.CHROME['browserName'], "goog",
File "/usr/local/lib/python3.9/site-packages/selenium/webdriver/chromium/webdriver.py", line 92, in __init__
super().__init__(
File "/usr/local/lib/python3.9/site-packages/selenium/webdriver/remote/webdriver.py", line 270, in __init__
self.start_session(capabilities, browser_profile)
File "/usr/local/lib/python3.9/site-packages/selenium/webdriver/remote/webdriver.py", line 363, in start_session
response = self.execute(Command.NEW_SESSION, parameters)
File "/usr/local/lib/python3.9/site-packages/selenium/webdriver/remote/webdriver.py", line 428, in execute
self.error_handler.check_response(response)
File "/usr/local/lib/python3.9/site-packages/selenium/webdriver/remote/errorhandler.py", line 243, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.WebDriverException: Message: unknown error: failed to wait for extension background page to load: chrome-extension://helkdidpeohdmiknmhmfamjeepmnmlib/_generated_background_page.html
from unknown error: page could not be found: chrome-extension://helkdidpeohdmiknmhmfamjeepmnmlib/_generated_background_page.html
Stacktrace:
0 chromedriver 0x00000001046c26b8 chromedriver + 4937400
1 chromedriver 0x00000001046b9b73 chromedriver + 4901747
2 chromedriver 0x0000000104277616 chromedriver + 435734
从网上检索到的信息,基本都说是无法安装扩展。这个时候如果顺着异常信息去寻找解决办法,就会陷入误区,无异于缘木求鱼。当时的思路是:有没有什么办法,在不用扩展的情况下把这个工作解决掉,这样问题就绕过去了。根据今天的探索,结论如下:headless模式下,是没有办法使用自定义扩展的。
于是又得到这样的答案,Create a PAC file (e.g., proxy_auth.pac) with the following content, replacing PROXY_HOST, PROXY_PORT, USERNAME, and PASSWORD with your actual proxy details:
proxy_auth.pac 文件内容。
// PAC entry point: receives the request URL and host and returns the proxy
// directive string to use for that request.
function FindProxyForURL(url, host) {
var proxy = "PROXY PROXY_HOST:PROXY_PORT";
// Credentials encoded in HTTP Basic form (base64 of "user:password").
// NOTE(review): btoa may not be available inside a PAC sandbox -- confirm.
var auth = "Basic " + btoa("USERNAME:PASSWORD");
// NOTE(review): `proxy` already starts with "PROXY ", so the returned string
// begins with "PROXY PROXY ...". A PAC result is presumably only a list of
// proxy directives, so the appended header text is unlikely to be sent as a
// Proxy-Authorization header -- confirm before relying on this.
return "PROXY " + proxy + "; " + "Proxy-Authorization: " + auth;
}
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# Launch headless Chrome configured through a PAC file and load a test page.
# Set the path to your PAC file
pac_file_path = "file:///path/to/your/proxy_auth.pac"
# Configure Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
# Hand the PAC file URL to Chrome via its --proxy-pac-url switch
chrome_options.add_argument(f"--proxy-pac-url={pac_file_path}")
# Set the path to your ChromeDriver executable
chromedriver_path = "/path/to/your/chromedriver"
# Create a new instance of Chrome with the specified options
# NOTE(review): executable_path is the legacy way to locate chromedriver;
# newer Selenium releases presumably expect a Service object instead --
# confirm against the installed Selenium version.
driver = webdriver.Chrome(executable_path=chromedriver_path, options=chrome_options)
# Navigate to a website of your choice
driver.get("https://www.example.com")
# Perform any required actions on the website
# ...
# Close the browser when finished
driver.quit()
通过运行上面的代码,你会发现会很失望的。逢山开路遇水搭桥,又检索到这样一篇博客。Selenium如何使用代理
当时看完,感觉也不行,其实是博客例子给的不好,结合自己代码的时候发生了错误,也就放弃了。但是提供了seleniumwire 解决输入用户名和密码的问题。最后使用了强大的chatGpt4版本。结果给出了完整的代码。
from seleniumwire import webdriver
from selenium.webdriver.chrome.options import Options
# Imported here so this snippet stays self-contained.
from selenium.webdriver.chrome.service import Service

# Set your proxy details (selenium-wire forwards the credentials embedded in
# these URLs to the upstream proxy, so no browser auth dialog is involved)
proxy_options = {
    'http': 'http://USERNAME:PASSWORD@PROXY_HOST:PROXY_PORT',
    'https': 'https://USERNAME:PASSWORD@PROXY_HOST:PROXY_PORT',
    'no_proxy': 'localhost,127.0.0.1'
}
# Configure Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
# Set the path to your ChromeDriver executable
chromedriver_path = "/path/to/your/chromedriver"
# Create a new instance of Chrome with the specified options and proxy
# settings. The driver location is passed through a Service object rather
# than the legacy executable_path keyword.
driver = webdriver.Chrome(
    service=Service(executable_path=chromedriver_path),
    options=chrome_options,
    seleniumwire_options={'proxy': proxy_options},
)
try:
    # Navigate to a website of your choice
    driver.get("https://www.example.com")
    # Perform any required actions on the website
    # ...
finally:
    # Close the browser when finished, even if a step above raised
    driver.quit()
别忘记使用shell 安装模块
pip install selenium-wire
在解决这个问题的时候,也看到一篇非常不错的帖子。Puppeteer 无头chrome下,4种方式解决代理认证的问题 。
第一种方式:两年来,Puppeteer 一直支持使用 authenticate() 方法对代理进行身份验证的内置解决方案。如今,这是在原版 Puppeteer 中最常用的方法。
const puppeteer = require('puppeteer');

const proxy = 'http://my.proxy.com:3001';
const username = 'jimmy49';
const password = 'password123';

// Launch the browser behind the proxy, register the proxy credentials with
// page.authenticate(), and load a test page.
(async () => {
    // Pass proxy URL into the --proxy-server arg
    const browser = await puppeteer.launch({
        args: [`--proxy-server=${proxy}`],
    });
    try {
        const page = await browser.newPage();
        // Authenticate our proxy with username and password defined above
        await page.authenticate({ username, password });
        await page.goto('https://www.google.com');
    } finally {
        // Always shut the browser down, even if navigation throws
        await browser.close();
    }
})();
第二种方式:proxy-chain 包是由 Apify 开发和维护的开源包,它提供了一种不同的方法,具有允许您轻松“匿名化”经过身份验证的代理的功能。 这可以通过将带有身份验证详细信息的代理 URL 传递到 proxyChain.anonymizeProxy 方法,然后在启动 Puppeteer 时在 --proxy-server 参数中使用其返回值来完成。
const puppeteer = require('puppeteer');
const proxyChain = require('proxy-chain');

const proxy = 'http://my.proxy.com:3001';
const username = 'jimmy49';
const password = 'password123';

// Wrap the authenticated proxy in a local credential-free proxy via
// proxy-chain, then point the browser at that local proxy.
(async () => {
    // `proxy` already carries its scheme, so take only host:port from it;
    // prepending "http://user:pass@" to the full URL would double the scheme.
    const { host } = new URL(proxy);
    const originalUrl = `http://${username}:${password}@${host}`;
    // Return anonymized version of original URL - looks like http://127.0.0.1:45678
    const newProxyUrl = await proxyChain.anonymizeProxy(originalUrl);
    try {
        const browser = await puppeteer.launch({
            args: [`--proxy-server=${newProxyUrl}`],
        });
        try {
            const page = await browser.newPage();
            await page.goto('https://www.google.com');
        } finally {
            // Always shut the browser down, even if navigation throws
            await browser.close();
        }
    } finally {
        // Close any pending connections
        await proxyChain.closeAnonymizedProxy(newProxyUrl, true);
    }
})();
第三种方式:
const Apify = require('apify');

const proxy = 'http://my.proxy.com:3001';
const username = 'jimmy49';
const password = 'password123';

// Crawl one page with Apify's PuppeteerCrawler through an authenticated proxy
// and print the page title.
Apify.main(async () => {
    // NOTE(review): some Apify SDK versions expect a list name as the first
    // argument of openRequestList -- confirm against the installed version.
    const requestList = await Apify.openRequestList([{ url: 'https://google.com' }]);
    // `proxy` already carries its scheme, so take only host:port from it;
    // prepending "http://user:pass@" to the full URL would double the scheme.
    const { host } = new URL(proxy);
    // Pass authenticated proxy URL into proxyUrls
    const proxyConfiguration = await Apify.createProxyConfiguration({
        proxyUrls: [`http://${username}:${password}@${host}`],
    });
    const crawler = new Apify.PuppeteerCrawler({
        // Only the request list is used; no request queue is defined in this
        // snippet, so none is passed.
        requestList,
        // Pass proxyConfiguration into the crawler
        proxyConfiguration,
        handlePageFunction: async ({ page }) => {
            const title = await page.title();
            console.log(title);
        },
    });
    await crawler.run();
});
第四种方式:
const puppeteer = require('puppeteer');

const proxy = 'http://my.proxy.com:3001';
const username = 'jimmy49';
const password = 'password123';

// Launch the browser behind the proxy and attach a Basic Proxy-Authorization
// header to the page's requests before loading a test page.
(async () => {
    // Pass proxy URL into the --proxy-server arg
    const browser = await puppeteer.launch({
        args: [`--proxy-server=${proxy}`],
    });
    try {
        const page = await browser.newPage();
        // Pass in our base64 encoded username and password
        await page.setExtraHTTPHeaders({
            'Proxy-Authorization': 'Basic ' + Buffer.from(`${username}:${password}`).toString('base64'),
        });
        await page.goto('https://www.google.com');
    } finally {
        // Always shut the browser down, even if navigation throws
        await browser.close();
    }
})();
从上面可以清楚地看出,puppeteer在代理权限认证上的解决方式还是比较多的。之前看过puppeteer的浏览器指纹解决方案,不过还是不怎么完善。自己在它的思路上,给selenium添加了大量的代码,结果轻松地解决了浏览器指纹问题。当然fingerprint.js可以轻松绕过。
总结:
使用扩展的方式只能解决有界面的情况下输入代理的用户名和密码的。无界面的情况下,需要使用seleniumwire 模块进行解决。 如果新项目的话,推荐使用puppeteer ,而不是selenium 。