LuTong pred 3 mesiacmi
rodič
commit
4c653d6501
1 zmenený súbor: 29 pridaní a 16 odobraní
  1. 29 16
      src/scraper.py

+ 29 - 16
src/scraper.py

@@ -15,8 +15,9 @@ except ImportError:
 
 import time, random, re, os, subprocess, urllib.parse, json, traceback, socket
 from selenium import webdriver
+from selenium.webdriver.edge.options import Options as EdgeOptions
+from selenium.webdriver.edge.service import Service as EdgeService
 from selenium.webdriver.chrome.options import Options as ChromeOptions
-from selenium.webdriver.chrome.service import Service as ChromeService
 import undetected_chromedriver as uc 
 from selenium.webdriver.common.by import By
 from selenium.webdriver.common.action_chains import ActionChains
@@ -50,35 +51,47 @@ class Scraper1688:
                 cmd.append("--headless")
             
             try:
+                # 异步启动浏览器进程
                 subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
                 time.sleep(3) # 等待浏览器初始化
                 
-                # 4. 关键:使用 Chrome 类接管 Edge
-                # 这样可以解决 selenium-stealth 只支持 Chrome 类的问题
-                opts = ChromeOptions()
+                # 4. 接管 Edge
+                opts = EdgeOptions()
                 opts.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
                 
+                # 尝试连接
                 try:
-                    self.driver = webdriver.Chrome(options=opts)
-                    print("[+] Edge 浏览器已通过 Chrome 类接管成功!")
+                    # 使用真正的 Edge 驱动类连接,解决“unrecognized Chrome version”报错
+                    self.driver = webdriver.Edge(options=opts)
+                    print("[+] Edge 浏览器已成功自动弹出并接管!")
                 except:
-                    print("[*] 尝试自动下载匹配的驱动接管...")
-                    from webdriver_manager.chrome import ChromeDriverManager
-                    service = ChromeService(ChromeDriverManager().install())
-                    self.driver = webdriver.Chrome(service=service, options=opts)
-                    print("[+] Edge 浏览器已成功接管!")
+                    # 如果连不上,尝试使用 webdriver_manager 自动下载匹配驱动
+                    print("[*] 尝试自动下载匹配的 EdgeDriver...")
+                    from webdriver_manager.microsoft import EdgeChromiumDriverManager
+                    service = EdgeService(EdgeChromiumDriverManager().install())
+                    self.driver = webdriver.Edge(service=service, options=opts)
+                    print("[+] Edge 浏览器已通过驱动管理接管成功!")
                     
             except Exception as e:
-                print(f"[*] Edge 自动接管模式失败,准备回退: {e}")
+                print(f"[*] Edge 自动接管模式失败,准备回退到 Chrome: {e}")
         
-        # 5. 兜底方案
+        # 5. 兜底方案:如果 Edge 启动或接管失败,启动 Chrome
         if not self.driver:
             print("[*] 正在启动 Chrome (undetected-chromedriver) 模式...")
             self._init_chrome(headless)
 
         if self.driver:
-            # 此时 self.driver 无论是接管还是自启,都是 Chrome 类型,不会报错
-            stealth(self.driver, languages=["zh-CN", "zh"], vendor="Google Inc.", platform="Win32", fix_hairline=True)
+            # 关键:只有在使用 Chrome 模式时才应用 stealth
+            # 接管模式下的 Edge 是真实的浏览器进程,本身就具备极高的隐蔽性
+            if "chrome" in str(type(self.driver)).lower() and "edge" not in str(type(self.driver)).lower():
+                stealth(self.driver, languages=["zh-CN", "zh"], vendor="Google Inc.", platform="Win32", fix_hairline=True)
+            else:
+                # 针对 Edge 的轻量级反爬补丁(避开库类型检查错误)
+                self.driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
+                    "source": """
+                        Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
+                    """
+                })
 
     def _find_edge(self):
         """ 通过注册表寻找 Edge 精准安装路径 """
@@ -255,7 +268,7 @@ class Scraper1688:
         except: return None
 
     def _extract_all_methods(self):
-        """ 列表页多方式提取 """
+        """ 列表页提取 """
         results = []
         try:
             res = self.driver.execute_script("return JSON.stringify(window.data || window.__INITIAL_DATA__)")