我们正常使用 seleniumwire 抓取数据的代码如下:
from seleniumwire import webdriver # Import from seleniumwire
from selenium.webdriver.chrome.service import Service
# Path to the local chromedriver binary.
chrome_driver_path = "/Users/cai/Documents/chromedriver-mac-x64/chromedriver"  # replace with your local path
service = Service(executable_path=chrome_driver_path)
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15"
options = webdriver.ChromeOptions()
options.add_argument(f'--user-agent={user_agent}')
# Build the selenium-wire WebDriver on top of the chromedriver service above.
driver = webdriver.Chrome(service=service, options=options)
try:
    # Open the target site.
    driver.get('https://www.xiaocaicai.com/')
    # Captured traffic is exposed via the `requests` attribute.
    # NOTE(review): requests fired after get() returns may not be captured
    # yet at this point - confirm whether an explicit wait is needed.
    for request in driver.requests:
        # Only report calls to the APM endpoint that already have a response.
        if request.response and 'apm-fe.xiaocaicai.com/api/data' in request.url:
            print(
                request.url,
                request.response.status_code,
                request.response.headers['Content-Type'],
                request.body
            )
finally:
    # Always shut the browser and chromedriver down, even if the page load
    # or the inspection loop raises; otherwise the processes are left behind.
    driver.quit()
通过日志可以看到,网站已经检测到我们是在使用 chromedriver 访问。
那我们在 seleniumwire 的基础上,使用 selenium-stealth 进行改造:
from seleniumwire import webdriver # 使用 seleniumwire 的 webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium_stealth import stealth
import os
def create_data_directory():
    """Create (if needed) and return the ``data`` directory next to this script.

    The directory is resolved relative to the location of this file, not the
    current working directory.

    Returns:
        The absolute path of the ``data`` directory.

    Raises:
        FileExistsError: If ``data`` exists but is not a directory.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    data_dir = os.path.join(script_dir, 'data')
    # Record the state first so the right message is printed, then let
    # makedirs(exist_ok=True) do the creation: a directory that appears
    # between the check and the call no longer makes the call fail.
    already_present = os.path.isdir(data_dir)
    os.makedirs(data_dir, exist_ok=True)
    if already_present:
        print(f"data 目录已存在: {data_dir}")
    else:
        print(f"已创建 data 目录: {data_dir}")
    return data_dir
def configure_chrome_options(data_dir):
    """Build the Chrome ``Options`` used for the stealth browser session.

    Args:
        data_dir: Directory under which the ``selenium_profile`` Chrome
            user-data directory path is built.

    Returns:
        An ``Options`` instance carrying the automation-flag override, the
        profile directory, the spoofed user agent and the maximized-window flag.
    """
    profile_dir = os.path.join(data_dir, 'selenium_profile')
    print("user_data_dir:" + profile_dir)
    spoofed_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; zh-CN) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
    # Flags are applied in this exact order.
    chrome_flags = (
        '--disable-blink-features=AutomationControlled',
        f"--user-data-dir={profile_dir}",
        f'--user-agent={spoofed_agent}',
        '--start-maximized',
    )
    chrome_options = Options()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)
    return chrome_options
# Directory that holds the persistent Chrome profile.
data_dir = create_data_directory()
# Path to the local chromedriver binary.
chrome_driver_path = "/Users/cai/Documents/chromedriver-mac-x64/chromedriver"  # replace with your local path
service = Service(executable_path=chrome_driver_path)
# Chrome options for the session.
options = configure_chrome_options(data_dir)
# selenium-wire specific options.
# NOTE(review): 'disable_encoding' presumably asks servers for uncompressed
# responses - confirm against the selenium-wire documentation.
seleniumwire_options = {
    'disable_encoding': True,
}
# Build the selenium-wire WebDriver.
driver = webdriver.Chrome(service=service, options=options, seleniumwire_options=seleniumwire_options)
try:
    # Apply selenium-stealth to the driver, presenting a Chinese-language
    # Windows environment.
    stealth(driver,
            languages=["zh-CN", "zh"],
            vendor="Google Inc.",
            platform="Win32",
            webgl_vendor="Intel Inc.",
            renderer="Intel Iris OpenGL Engine",
            fix_hairline=True,
            )
    # Open the target site.
    driver.get('https://www.xiaocaicai.com/')
    # Captured traffic is exposed via the `requests` attribute.
    # NOTE(review): requests fired after get() returns may not be captured
    # yet at this point - confirm whether an explicit wait is needed.
    for request in driver.requests:
        # Only report calls to the APM endpoint that already have a response.
        if request.response and 'apm-fe.xiaocaicai.com/api/data' in request.url:
            print(
                request.url,
                request.response.status_code,
                request.response.headers['Content-Type'],
                request.body
            )
finally:
    # Always shut the browser and chromedriver down, even if the stealth
    # setup, the page load or the inspection loop raises.
    driver.quit()
可以看到,使用 selenium-stealth 做反检测处理之后,就可以避免被检测到了。