LuTong 3 months ago
parent
commit
238f0b333b
1 changed file with 16 additions and 14 deletions
  1. 16 14
      src/scraper.py

+ 16 - 14
src/scraper.py

@@ -103,15 +103,15 @@ class Scraper1688:
             try:
                 self._cleanup()
                 time.sleep(2)
-                # 兜底方案
-                self.driver = uc.Chrome(options=create_options(), headless=headless, use_subprocess=True)
+                # 兜底方案:不使用 subprocess
+                self.driver = uc.Chrome(options=create_options(), headless=headless)
                 print("[+] 自动兼容模式启动成功!")
             except Exception as e2:
                 print(f"[致命错误] 无法启动 Chrome: {e2}")
                 raise Exception("无法拉起 Chrome,请尝试关闭杀毒软件或重新安装 Chrome。")
 
     def clean_url(self, url):
-        """ 【核心订正】极其鲁棒的 ID 提取逻辑,强制转化为详情页链接,过滤店铺页 """
+        """ 极其鲁棒的 ID 提取逻辑,强制转化为详情页链接,过滤店铺页 """
         if not url: return ""
         if url.startswith("//"): url = "https:" + url
         
@@ -163,9 +163,8 @@ class Scraper1688:
             print(f"[*] 正在处理列表页: 第 {page} 页...")
             self.driver.get(f"{base_url}&beginPage={page}&page={page}")
             self.check_for_captcha()
-            # 增强滚动
-            for i in range(1, 6):
-                self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/5});")
+            for i in range(1, 5):
+                self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/4});")
                 time.sleep(1.5)
 
             page_results = self._extract_all_methods()
@@ -222,15 +221,19 @@ class Scraper1688:
                 except: pass
                 return ""
 
+            def safe_text(by, sel):
+                try: return self.driver.find_element(by, sel).text.strip()
+                except: return ""
+
             trade = model.get("tradeModel", {}) if isinstance(model, dict) else {}
             ranges = trade.get("disPriceRanges") or trade.get("currentPrices") or []
             range_text = " / ".join([f"{r.get('beginAmount')}起 ¥{r.get('price') or r.get('discountPrice')}" for r in ranges])
 
             base_data = {
-                "category": (model.get("offerDetail", {}).get("leafCategoryName", "") if isinstance(model, dict) else "") or self.driver.find_element(By.CSS_SELECTOR, "div[class*=breadcrumb] a:last-child").text.strip(),
+                "category": (model.get("offerDetail", {}).get("leafCategoryName", "") if isinstance(model, dict) else "") or safe_text(By.CSS_SELECTOR, "div[class*=breadcrumb] a:last-child"),
                 "brand": get_attr("品牌"),
                 "name": (model.get("offerDetail", {}).get("subject", "") if isinstance(model, dict) else "") or self.driver.title.split('-')[0],
-                "spec": get_attr("尺码") or get_attr("规格") or get_attr("型号"),
+                "spec": get_attr("尺码") or get_attr("规格") or get_attr("型号") or safe_text(By.XPATH, "//div[@id='productAttributes']//th[span='尺码' or span='规格']/following-sibling::td[1]//span[@class='field-value']"),
                 "material": get_attr("材质") or get_attr("面料"),
                 "price": "", 
                 "moq": trade.get("beginAmount", ""),
@@ -282,13 +285,13 @@ class Scraper1688:
                         results.append(row)
                 return results
             
+            base_data["price"] = trade.get("minPrice", "")
             return [base_data]
         except: return None
 
     def _extract_all_methods(self):
-        """ 强化版列表页提取:支持最新 1688 选择器 """
+        """ 列表页提取 """
         results = []
-        # 1. JSON 提取
         try:
             res = self.driver.execute_script("return JSON.stringify(window.data || window.__INITIAL_DATA__)")
             if res:
@@ -305,14 +308,13 @@ class Scraper1688:
                     if link and "similar_search" not in link:
                         results.append({"name": str(o.get('title', '')), "link": link})
         except: pass
-        # 2. 强力 DOM 扫描
         if not results:
-            selectors = [".sm-offer-item", ".offer-card-item", ".search-offer-item", "[class*='offer-card']", ".offer-item"]
-            for s in selectors:
+            # 引入最新版 1688 选择器,确保能抓到详情链接
+            for s in [".sm-offer-item", ".offer-card-item", ".search-offer-item", "[class*='offer-card']", ".offer-item"]:
                 for el in self.driver.find_elements(By.CSS_SELECTOR, s):
                     try:
                         link = el.find_element(By.TAG_NAME, "a").get_attribute("href")
-                        if link and "similar_search" not in link:
+                        if link and ("offer" in link or "item" in link) and "similar_search" not in link:
                             results.append({"name": el.text.split('\n')[0][:50], "link": link})
                     except: continue
                 if results: break