LuTong 3 months ago
parent
commit
238f0b333b
1 changed file with 16 additions and 14 deletions
  1. 16 14
      src/scraper.py

+ 16 - 14
src/scraper.py

@@ -103,15 +103,15 @@ class Scraper1688:
             try:
                 self._cleanup()
                 time.sleep(2)
-                # 兜底方案
-                self.driver = uc.Chrome(options=create_options(), headless=headless, use_subprocess=True)
+                # 兜底方案:不使用 subprocess
+                self.driver = uc.Chrome(options=create_options(), headless=headless)
                 print("[+] 自动兼容模式启动成功!")
             except Exception as e2:
                 print(f"[致命错误] 无法启动 Chrome: {e2}")
                 raise Exception("无法拉起 Chrome,请尝试关闭杀毒软件或重新安装 Chrome。")
 
     def clean_url(self, url):
-        """ 【核心订正】极其鲁棒的 ID 提取逻辑,强制转化为详情页链接,过滤店铺页 """
+        """ 极其鲁棒的 ID 提取逻辑,强制转化为详情页链接,过滤店铺页 """
         if not url: return ""
         if url.startswith("//"): url = "https:" + url
         
@@ -163,9 +163,8 @@ class Scraper1688:
             print(f"[*] 正在处理列表页: 第 {page} 页...")
             self.driver.get(f"{base_url}&beginPage={page}&page={page}")
             self.check_for_captcha()
-            # 增强滚动
-            for i in range(1, 6):
-                self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/5});")
+            for i in range(1, 5):
+                self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/4});")
                 time.sleep(1.5)
 
             page_results = self._extract_all_methods()
@@ -222,15 +221,19 @@ class Scraper1688:
                 except: pass
                 return ""
 
+            def safe_text(by, sel):
+                try: return self.driver.find_element(by, sel).text.strip()
+                except: return ""
+
             trade = model.get("tradeModel", {}) if isinstance(model, dict) else {}
             ranges = trade.get("disPriceRanges") or trade.get("currentPrices") or []
             range_text = " / ".join([f"{r.get('beginAmount')}起 ¥{r.get('price') or r.get('discountPrice')}" for r in ranges])
 
             base_data = {
-                "category": (model.get("offerDetail", {}).get("leafCategoryName", "") if isinstance(model, dict) else "") or self.driver.find_element(By.CSS_SELECTOR, "div[class*=breadcrumb] a:last-child").text.strip(),
+                "category": (model.get("offerDetail", {}).get("leafCategoryName", "") if isinstance(model, dict) else "") or safe_text(By.CSS_SELECTOR, "div[class*=breadcrumb] a:last-child"),
                 "brand": get_attr("品牌"),
                 "name": (model.get("offerDetail", {}).get("subject", "") if isinstance(model, dict) else "") or self.driver.title.split('-')[0],
-                "spec": get_attr("尺码") or get_attr("规格") or get_attr("型号"),
+                "spec": get_attr("尺码") or get_attr("规格") or get_attr("型号") or safe_text(By.XPATH, "//div[@id='productAttributes']//th[span='尺码' or span='规格']/following-sibling::td[1]//span[@class='field-value']"),
                 "material": get_attr("材质") or get_attr("面料"),
                 "price": "", 
                 "moq": trade.get("beginAmount", ""),
@@ -282,13 +285,13 @@ class Scraper1688:
                         results.append(row)
                 return results
             
+            base_data["price"] = trade.get("minPrice", "")
             return [base_data]
         except: return None
 
     def _extract_all_methods(self):
-        """ 强化版列表页提取:支持最新 1688 选择器 """
+        """ 列表页提取 """
         results = []
-        # 1. JSON 提取
         try:
             res = self.driver.execute_script("return JSON.stringify(window.data || window.__INITIAL_DATA__)")
             if res:
@@ -305,14 +308,13 @@ class Scraper1688:
                     if link and "similar_search" not in link:
                         results.append({"name": str(o.get('title', '')), "link": link})
         except: pass
-        # 2. 强力 DOM 扫描
         if not results:
-            selectors = [".sm-offer-item", ".offer-card-item", ".search-offer-item", "[class*='offer-card']", ".offer-item"]
-            for s in selectors:
+            # 引入最新版 1688 选择器,确保能抓到详情链接
+            for s in [".sm-offer-item", ".offer-card-item", ".search-offer-item", "[class*='offer-card']", ".offer-item"]:
                 for el in self.driver.find_elements(By.CSS_SELECTOR, s):
                     try:
                         link = el.find_element(By.TAG_NAME, "a").get_attribute("href")
-                        if link and "similar_search" not in link:
+                        if link and ("offer" in link or "item" in link) and "similar_search" not in link:
                             results.append({"name": el.text.split('\n')[0][:50], "link": link})
                     except: continue
                 if results: break