LuTong, 1 month ago
Commit f54f68e0f6
1 changed file with 44 additions and 16 deletions

src/scraper.py (+44, -16)

@@ -44,6 +44,29 @@ class Scraper1688:
             except: continue
         return None
 
+    def _get_chrome_version_main(self):
+        """从 Windows 注册表读取本机 Chrome 主版本号(如 145),用于匹配 ChromeDriver,避免版本不一致。"""
+        if os.name != "nt":
+            return None
+        import winreg
+        reg_paths = [
+            (winreg.HKEY_LOCAL_MACHINE, r"SOFTWARE\Wow6432Node\Microsoft\Windows\CurrentVersion\Uninstall\Google Chrome"),
+            (winreg.HKEY_LOCAL_MACHINE, r"SOFTWARE\Microsoft\Windows\CurrentVersion\Uninstall\Google Chrome"),
+            (winreg.HKEY_CURRENT_USER, r"SOFTWARE\Microsoft\Windows\CurrentVersion\Uninstall\Google Chrome"),
+        ]
+        for hkey, subkey in reg_paths:
+            for value_name in ("DisplayVersion", "version"):
+                try:
+                    with winreg.OpenKey(hkey, subkey) as key:
+                        ver, _ = winreg.QueryValueEx(key, value_name)
+                        if ver:
+                            main = int(str(ver).split(".")[0])
+                            if 80 <= main <= 200:
+                                return main
+                except Exception:
+                    continue
+        return None
+
     def _cleanup(self):
         if os.name == 'nt':
             for proc in ['chrome.exe', 'chromedriver.exe']:
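The new helper only covers Windows (it returns None early when os.name is not "nt"). For completeness, a minimal sketch of the same major-version lookup on macOS/Linux by asking the Chrome binary itself; the candidate binary paths and the "--version" output format ("Google Chrome 131.0.6778.85") are assumptions about a typical install, not part of this commit:

import re
import subprocess

def get_chrome_version_main_posix():
    """Best-effort Chrome major version on macOS/Linux, parsed from
    `chrome --version` output; returns None if no candidate responds."""
    candidates = [
        "google-chrome", "google-chrome-stable", "chromium-browser",
        "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
    ]
    for binary in candidates:
        try:
            out = subprocess.run([binary, "--version"],
                                 capture_output=True, text=True, timeout=5)
            match = re.search(r"(\d+)\.\d+\.\d+", out.stdout)
            if match:
                return int(match.group(1))
        except (OSError, subprocess.TimeoutExpired):
            continue
    return None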
@@ -58,6 +81,7 @@ class Scraper1688:
 
     def _init_chrome(self, headless):
         chrome_path = self._find_chrome()
+        version_main = self._get_chrome_version_main()
         def create_options():
             opts = uc.ChromeOptions()
             opts.add_argument(f'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36')
@@ -69,12 +93,16 @@ class Scraper1688:
             opts.add_argument("--disable-dev-shm-usage")
             opts.add_argument("--remote-allow-origins=*")
             return opts
+        kwargs = {"options": create_options(), "headless": headless, "use_subprocess": True}
+        if version_main is not None:
+            kwargs["version_main"] = version_main
+        if chrome_path:
+            kwargs["browser_executable_path"] = chrome_path
         try:
-            self.driver = uc.Chrome(options=create_options(), headless=headless, browser_executable_path=chrome_path, use_subprocess=True)
-            # self.driver = uc.Chrome(options=create_options(), headless=headless, browser_executable_path=chrome_path, use_subprocess=True)
-        except :
-        # except Exception:
-            self.driver = uc.Chrome(options=create_options(), headless=headless, use_subprocess=True)
+            self.driver = uc.Chrome(**kwargs)
+        except Exception:
+            kwargs.pop("version_main", None)
+            self.driver = uc.Chrome(**kwargs)
 
     def clean_url(self, url):
         if not url: return ""
@@ -111,14 +139,14 @@ class Scraper1688:
             self.driver.get(f"{base_url}&beginPage={page}&page={page}")
             self.check_for_captcha()
             
-            # --- Key: pulsed segmented scrolling to force-trigger lazy loading ---
-            for i in range(1, 16):
-                self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/15});")
-                time.sleep(random.uniform(1.2, 2.5))
-                if i % 4 == 0:
-                    self.driver.execute_script(f"window.scrollBy(0, -400);")
-                    time.sleep(1.0)
-            time.sleep(random.uniform(3, 6))
+            # --- Pulsed segmented scrolling to trigger lazy loading (waits shortened to balance speed and completeness) ---
+            for i in range(1, 9):
+                self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/8});")
+                time.sleep(random.uniform(0.35, 0.85))
+                if i % 3 == 0:
+                    self.driver.execute_script("window.scrollBy(0, -300);")
+                    time.sleep(0.4)
+            time.sleep(random.uniform(1.2, 2.8))
 
             page_results = self._extract_all_methods()
             print(f"  [+] 本页解析完成:共发现 {len(page_results)} 个商品链接")
@@ -146,13 +174,13 @@ class Scraper1688:
                         yield page_batch
                         page_batch = []
                     
-                    time.sleep(random.uniform(40, 80)) 
+                    time.sleep(random.uniform(15, 35))
                     if len(all_links) >= total_count + initial_count: break
             
             if page_batch: yield page_batch
             page += 1
             self.driver.get("https://www.1688.com")
-            time.sleep(60)
+            time.sleep(random.uniform(18, 38))
         return list(all_links)
 
     def scrape_detail(self, url):
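The page-to-page delays above are now hard-coded random ranges in two places; a small sketch that names them in one spot so the rate limiting can be retuned centrally. The constant names and default ranges are illustrative, not part of this commit:

import random
import time

PAGE_DELAY_RANGE = (15, 35)    # between result pages of the same search
RESET_DELAY_RANGE = (18, 38)   # after navigating back to www.1688.com

def polite_pause(delay_range):
    """Sleep for a random duration inside the given range so request timing
    looks less mechanical."""
    time.sleep(random.uniform(*delay_range))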
@@ -171,7 +199,7 @@ class Scraper1688:
                 for btn in expand_btns:
                     if btn.is_displayed():
                         self.driver.execute_script("arguments[0].click();", btn)
-                        time.sleep(1.5)
+                        time.sleep(0.6)
             except: pass
 
             self.check_for_captcha()
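The 0.6 s pause after each expand click above is still a fixed guess; a sketch of an explicit wait that blocks only until the revealed section is actually visible. The .detail-desc locator is a placeholder assumption, since the selectors used to find expand_btns are not part of this diff:

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def click_and_wait(driver, button, locator=(By.CSS_SELECTOR, ".detail-desc"), timeout=5):
    """Click an expand button via JS, then wait until the revealed content
    becomes visible instead of sleeping a fixed amount."""
    driver.execute_script("arguments[0].click();", button)
    try:
        WebDriverWait(driver, timeout).until(
            EC.visibility_of_element_located(locator))
    except Exception:
        pass  # content may already be visible, or the locator may not match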