LuTong 2 luni în urmă
părinte
comite
b3ace4d04a
1 a modificat fișierele cu 60 adăugiri și 55 ștergeri
  1. 60 55
      src/scraper.py

+ 60 - 55
src/scraper.py

@@ -1,4 +1,5 @@
-# 【版本:2026-01-16 13:45 - 变体与价格精准同步版】
+# 【版本:2026-01-16 14:00 - 变体精准解析终极版】
+# 针对 Python 3.12+ 移除 distutils 的兼容性补丁
 import sys
 try:
     import distutils
@@ -64,6 +65,7 @@ class Scraper1688:
     def _init_chrome(self, headless):
         """ 强化版 Chrome 启动逻辑 """
         chrome_path = self._find_chrome()
+        
         def create_options():
             opts = uc.ChromeOptions()
             opts.add_argument(f'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36')
@@ -75,14 +77,16 @@ class Scraper1688:
             opts.add_argument("--disable-dev-shm-usage")
             opts.add_argument("--remote-allow-origins=*")
             return opts
+
         try:
+            # 优先使用 subprocess 模式启动,解决 Win11 连接难题
             self.driver = uc.Chrome(options=create_options(), headless=headless, browser_executable_path=chrome_path, use_subprocess=True)
-        except Exception:
+        except:
             # 失败则尝试普通模式
             self.driver = uc.Chrome(options=create_options(), headless=headless, use_subprocess=True)
 
     def clean_url(self, url):
-        """ 鲁棒的 ID 提取并重组链接 """
+        """ 鲁棒的 ID 提取 logic """
         if not url: return ""
         url_str = str(url)
         if url_str.startswith("//"): url_str = "https:" + url_str
@@ -103,8 +107,7 @@ class Scraper1688:
             if self.status_callback: self.status_callback(True, msg)
             while is_blocked(): time.sleep(2)
             if self.status_callback: self.status_callback(False, "验证通过")
-            if self.log_callback: self.log_callback(
-                "<font color='orange'>验证成功,进入 120 秒冷却期重置行为指纹...</font>")
+            # 验证成功后强制冷却,防止二次封禁
             time.sleep(random.randint(60, 120))
         return True
 
@@ -117,25 +120,24 @@ class Scraper1688:
         page, initial_count = 1, len(all_links)
         
         while len(all_links) < total_count + initial_count:
-            page_anotation = f"[∫] 列表页采集: 第 {page} 页...";
-            print(page_anotation)
-            if self.log_callback: self.log_callback(page_anotation)
+            print(f"[*] 列表页采集: 第 {page} 页...")
             self.driver.get(f"{base_url}&beginPage={page}&page={page}")
             self.check_for_captcha()
             
-            # --- 强化:模拟真实人类分段滚动,触发懒加载 ---
+            # --- 强化:分段滚动激活懒加载,解决第一页只解析到一个的问题 ---
             for i in range(1, 13):
                 # 分段滑动
                 self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/12});")
-                time.sleep(random.uniform(1.2, 2.8))
-                # 关键:每隔几步向上“回弹”一下,这种非规律动作最能触发 1688 加载钩子
+                time.sleep(random.uniform(1.5, 3.0))
+                # 关键:向上“回弹”动作触发 1688 加载钩子
                 if i % 4 == 0:
                     self.driver.execute_script(f"window.scrollBy(0, -{random.randint(200, 500)});")
                     time.sleep(1.0)
-            time.sleep(random.uniform(3, 6))
+            
+            time.sleep(random.uniform(3, 6)) # 最终等待数据渲染
 
             page_results = self._extract_all_methods()
-            print(f"  [+] 本页解析完成:共发现 {len(page_results)} 个(潜在)商品链接")
+            print(f"  [+] 本页解析完成:共发现 {len(page_results)} 个商品链接")
             
             page_batch = []
             for it in page_results:
@@ -143,7 +145,7 @@ class Scraper1688:
                 if clean_url and clean_url not in all_links:
                     all_links.add(clean_url)
                     
-                    # 冷却机制
+                    # 保护机制
                     new_count = len(all_links) - initial_count
                     if new_count > 0 and new_count % 12 == 0:
                         rest_secs = random.randint(300, 600)
@@ -161,8 +163,7 @@ class Scraper1688:
                         yield page_batch
                         page_batch = []
                     
-                    # 详情页后的随机等待
-                    time.sleep(random.uniform(40, 80))
+                    time.sleep(random.uniform(40, 80)) 
                     if len(all_links) >= total_count + initial_count: break
             
             if page_batch: yield page_batch
@@ -172,7 +173,7 @@ class Scraper1688:
         return list(all_links)
 
     def scrape_detail(self, url):
-        """ 极精准变体解析:锁定 expand-view-list 并提取款式与逐条价格 """
+        """ 极精准变体解析:锁定 expand-view-list 区域并拆分价格与描述 """
         try:
             self.driver.get(url)
             # 仿真阅读
@@ -202,66 +203,70 @@ class Scraper1688:
                 "category": (model.get("offerDetail", {}).get("leafCategoryName", "") if isinstance(model, dict) else "") or self.driver.find_element(By.CSS_SELECTOR, "div[class*=breadcrumb] a:last-child").text.strip(),
                 "brand": get_attr("品牌"),
                 "name": (model.get("offerDetail", {}).get("subject", "") if isinstance(model, dict) else "") or self.driver.title.split('-')[0],
-                "spec": get_attr("尺码") or get_attr("规格") or get_attr("型号"),
+                "spec": "", # 待填充
+                "color": "", # 待填充
                 "material": get_attr("材质") or get_attr("面料"),
+                "price": "", # 待填充
                 "moq": trade.get("beginAmount", ""),
                 "wholesale_price": range_text,
                 "link": url,
                 "supplier": (model.get("sellerModel", {}).get("companyName", "") if isinstance(model, dict) else ""),
             }
 
-            # --- 核心逻辑订正:精准识别变体区域并拆分多行数据 ---
             variant_results = []
             try:
-                # 按照用户提供的线索,锁定核心容器
+                # --- 关键订正:基于用户发现的 expand-view-list 锁定变体区域 ---
                 # 兼容 expand-view-list 和 expand-view-list-wrapper
                 wrappers = self.driver.find_elements(By.CSS_SELECTOR, ".expand-view-list, .expand-view-list-wrapper")
                 if wrappers:
-                    # 1. 尝试直接获取所有 label 和 price 的对
-                    labels = wrappers[0].find_elements(By.CLASS_NAME, "item-label")
-                    prices = wrappers[0].find_elements(By.CLASS_NAME, "item-price-stock")
-                    
-                    if labels and prices and len(labels) == len(prices):
-                        for i in range(len(labels)):
-                            l_text = labels[i].text.strip()
-                            p_text = prices[i].text.strip()
+                    # 寻找每一个变体子项条目
+                    items = wrappers[0].find_elements(By.CSS_SELECTOR, ".expand-view-list-item, [class*='list-item'], .sku-item")
+                    for item_el in items:
+                        try:
+                            # 1. 描述文字 (item-label) -> 对应 Excel “规格尺码”和“颜色”列
+                            label_el = item_el.find_element(By.CLASS_NAME, "item-label")
+                            label_text = label_el.text.strip()
+                            
+                            # 2. 逐条对应的价格 (item-price-stock) -> 对应 Excel “单品进价(元)”列
+                            price_el = item_el.find_element(By.CLASS_NAME, "item-price-stock")
+                            price_raw = price_el.text.strip()
                             # 价格清洗:只保留数字和小数点
-                            p_clean = re.sub(r'[^\d.]', '', p_text)
-                            if l_text:
+                            price_clean = re.sub(r'[^\d.]', '', price_raw)
+                            
+                            if label_text:
                                 row = base_data.copy()
-                                row["color"] = l_text # 款式描述 -> 颜色列
-                                row["price"] = p_clean if p_clean else p_text # 逐条价格 -> 单品进价列
-                                if not row["spec"]: row["spec"] = l_text # 规格也同步填充
+                                # 按照用户要求进行映射
+                                row["spec"] = label_text   # 描述文字填入“规格尺码”
+                                row["color"] = label_text  # 同步填入“颜色”
+                                row["price"] = price_clean # 价格数字填入“单品进价(元)”
                                 variant_results.append(row)
-                    
-                    # 2. 如果数量对不上,尝试按照子容器逐项提取
-                    if not variant_results:
-                        items = wrappers[0].find_elements(By.CSS_SELECTOR, ".expand-view-list-item, [class*='list-item'], .sku-item")
-                        for item_el in items:
-                            try:
-                                l_el = item_el.find_element(By.CLASS_NAME, "item-label")
-                                p_el = item_el.find_element(By.CLASS_NAME, "item-price-stock")
-                                if l_el and p_el:
-                                    l_text = l_el.text.strip()
-                                    p_text = p_el.text.strip()
-                                    p_clean = re.sub(r'[^\d.]', '', p_text)
-                                    if l_text:
-                                        row = base_data.copy()
-                                        row["color"] = l_text
-                                        row["price"] = p_clean if p_clean else p_text
-                                        if not row["spec"]: row["spec"] = l_text
-                                        variant_results.append(row)
-                            except: continue
-            except Exception as e:
-                print(f"  [!] 变体区域解析异常: {e}")
+                        except: continue
+            except: pass
 
             if variant_results:
-                print(f"  [+] 成功解析到 {len(variant_results)} 个款式变体")
                 return variant_results
+            
+            # 方案 B: 回退到模型提取
+            sku_props = model.get("skuModel", {}).get("skuProps", []) or model.get("detailData", {}).get("skuProps", []) or []
+            main_prop = next((p for p in sku_props if any(k in p.get("prop", "") for k in ["颜色", "分类", "款式", "花色", "净含量"])), None)
+            if not main_prop and sku_props: main_prop = sku_props[0]
+            if main_prop and main_prop.get("value"):
+                results = []
+                for val in main_prop["value"]:
+                    if val.get("name"):
+                        row = base_data.copy()
+                        row["color"] = val.get("name")
+                        row["spec"] = val.get("name")
+                        row["price"] = trade.get("minPrice", "")
+                        results.append(row)
+                return results
+            
+            base_data["price"] = trade.get("minPrice", "")
             return [base_data]
         except: return None
 
     def _extract_all_methods(self):
+        """ 强化版探测:从内存变量中抓取列表 """
         results = []
         seen_ids = set()
         def add_item(name, link):