LuTong 2 luni în urmă
părinte
comite
b3ace4d04a
1 a modificat fișierele cu 60 adăugiri și 55 ștergeri
  1. 60 55
      src/scraper.py

+ 60 - 55
src/scraper.py

@@ -1,4 +1,5 @@
-# 【版本:2026-01-16 13:45 - 变体与价格精准同步版】
+# 【版本:2026-01-16 14:00 - 变体精准解析终极版】
+# 针对 Python 3.12+ 移除 distutils 的兼容性补丁
 import sys
 try:
     import distutils
@@ -64,6 +65,7 @@ class Scraper1688:
     def _init_chrome(self, headless):
         """ 强化版 Chrome 启动逻辑 """
         chrome_path = self._find_chrome()
+        
         def create_options():
             opts = uc.ChromeOptions()
             opts.add_argument(f'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36')
@@ -75,14 +77,16 @@ class Scraper1688:
             opts.add_argument("--disable-dev-shm-usage")
             opts.add_argument("--remote-allow-origins=*")
             return opts
+
         try:
+            # 优先使用 subprocess 模式启动,解决 Win11 连接难题
             self.driver = uc.Chrome(options=create_options(), headless=headless, browser_executable_path=chrome_path, use_subprocess=True)
-        except Exception:
+        except:
             # 失败则尝试普通模式
             self.driver = uc.Chrome(options=create_options(), headless=headless, use_subprocess=True)
 
     def clean_url(self, url):
-        """ 鲁棒的 ID 提取并重组链接 """
+        """ 鲁棒的 ID 提取 logic """
         if not url: return ""
         url_str = str(url)
         if url_str.startswith("//"): url_str = "https:" + url_str
@@ -103,8 +107,7 @@ class Scraper1688:
             if self.status_callback: self.status_callback(True, msg)
             while is_blocked(): time.sleep(2)
             if self.status_callback: self.status_callback(False, "验证通过")
-            if self.log_callback: self.log_callback(
-                "<font color='orange'>验证成功,进入 120 秒冷却期重置行为指纹...</font>")
+            # 验证成功后强制冷却,防止二次封禁
             time.sleep(random.randint(60, 120))
         return True
 
@@ -117,25 +120,24 @@ class Scraper1688:
         page, initial_count = 1, len(all_links)
         
         while len(all_links) < total_count + initial_count:
-            page_anotation = f"[∫] 列表页采集: 第 {page} 页...";
-            print(page_anotation)
-            if self.log_callback: self.log_callback(page_anotation)
+            print(f"[*] 列表页采集: 第 {page} 页...")
             self.driver.get(f"{base_url}&beginPage={page}&page={page}")
             self.check_for_captcha()
             
-            # --- 强化:模拟真实人类分段滚动,触发懒加载 ---
+            # --- 强化:分段滚动激活懒加载,解决第一页只解析到一个的问题 ---
             for i in range(1, 13):
                 # 分段滑动
                 self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/12});")
-                time.sleep(random.uniform(1.2, 2.8))
-                # 关键:每隔几步向上“回弹”一下,这种非规律动作最能触发 1688 加载钩子
+                time.sleep(random.uniform(1.5, 3.0))
+                # 关键:向上“回弹”动作触发 1688 加载钩子
                 if i % 4 == 0:
                     self.driver.execute_script(f"window.scrollBy(0, -{random.randint(200, 500)});")
                     time.sleep(1.0)
-            time.sleep(random.uniform(3, 6))
+            
+            time.sleep(random.uniform(3, 6)) # 最终等待数据渲染
 
             page_results = self._extract_all_methods()
-            print(f"  [+] 本页解析完成:共发现 {len(page_results)} 个(潜在)商品链接")
+            print(f"  [+] 本页解析完成:共发现 {len(page_results)} 个商品链接")
             
             page_batch = []
             for it in page_results:
@@ -143,7 +145,7 @@ class Scraper1688:
                 if clean_url and clean_url not in all_links:
                     all_links.add(clean_url)
                     
-                    # 冷却机制
+                    # 保护机制
                     new_count = len(all_links) - initial_count
                     if new_count > 0 and new_count % 12 == 0:
                         rest_secs = random.randint(300, 600)
@@ -161,8 +163,7 @@ class Scraper1688:
                         yield page_batch
                         page_batch = []
                     
-                    # 详情页后的随机等待
-                    time.sleep(random.uniform(40, 80))
+                    time.sleep(random.uniform(40, 80)) 
                     if len(all_links) >= total_count + initial_count: break
             
             if page_batch: yield page_batch
@@ -172,7 +173,7 @@ class Scraper1688:
         return list(all_links)
 
     def scrape_detail(self, url):
-        """ 极精准变体解析:锁定 expand-view-list 并提取款式与逐条价格 """
+        """ 极精准变体解析:锁定 expand-view-list 区域并拆分价格与描述 """
         try:
             self.driver.get(url)
             # 仿真阅读
@@ -202,66 +203,70 @@ class Scraper1688:
                 "category": (model.get("offerDetail", {}).get("leafCategoryName", "") if isinstance(model, dict) else "") or self.driver.find_element(By.CSS_SELECTOR, "div[class*=breadcrumb] a:last-child").text.strip(),
                 "brand": get_attr("品牌"),
                 "name": (model.get("offerDetail", {}).get("subject", "") if isinstance(model, dict) else "") or self.driver.title.split('-')[0],
-                "spec": get_attr("尺码") or get_attr("规格") or get_attr("型号"),
+                "spec": "", # 待填充
+                "color": "", # 待填充
                 "material": get_attr("材质") or get_attr("面料"),
+                "price": "", # 待填充
                 "moq": trade.get("beginAmount", ""),
                 "wholesale_price": range_text,
                 "link": url,
                 "supplier": (model.get("sellerModel", {}).get("companyName", "") if isinstance(model, dict) else ""),
             }
 
-            # --- 核心逻辑订正:精准识别变体区域并拆分多行数据 ---
             variant_results = []
             try:
-                # 按照用户提供的线索,锁定核心容器
+                # --- 关键订正:基于用户发现的 expand-view-list 锁定变体区域 ---
                 # 兼容 expand-view-list 和 expand-view-list-wrapper
                 wrappers = self.driver.find_elements(By.CSS_SELECTOR, ".expand-view-list, .expand-view-list-wrapper")
                 if wrappers:
-                    # 1. 尝试直接获取所有 label 和 price 的对
-                    labels = wrappers[0].find_elements(By.CLASS_NAME, "item-label")
-                    prices = wrappers[0].find_elements(By.CLASS_NAME, "item-price-stock")
-                    
-                    if labels and prices and len(labels) == len(prices):
-                        for i in range(len(labels)):
-                            l_text = labels[i].text.strip()
-                            p_text = prices[i].text.strip()
+                    # 寻找每一个变体子项条目
+                    items = wrappers[0].find_elements(By.CSS_SELECTOR, ".expand-view-list-item, [class*='list-item'], .sku-item")
+                    for item_el in items:
+                        try:
+                            # 1. 描述文字 (item-label) -> 对应 Excel “规格尺码”和“颜色”列
+                            label_el = item_el.find_element(By.CLASS_NAME, "item-label")
+                            label_text = label_el.text.strip()
+                            
+                            # 2. 逐条对应的价格 (item-price-stock) -> 对应 Excel “单品进价(元)”列
+                            price_el = item_el.find_element(By.CLASS_NAME, "item-price-stock")
+                            price_raw = price_el.text.strip()
                             # 价格清洗:只保留数字和小数点
-                            p_clean = re.sub(r'[^\d.]', '', p_text)
-                            if l_text:
+                            price_clean = re.sub(r'[^\d.]', '', price_raw)
+                            
+                            if label_text:
                                 row = base_data.copy()
-                                row["color"] = l_text # 款式描述 -> 颜色列
-                                row["price"] = p_clean if p_clean else p_text # 逐条价格 -> 单品进价列
-                                if not row["spec"]: row["spec"] = l_text # 规格也同步填充
+                                # 按照用户要求进行映射
+                                row["spec"] = label_text   # 描述文字填入“规格尺码”
+                                row["color"] = label_text  # 同步填入“颜色”
+                                row["price"] = price_clean # 价格数字填入“单品进价(元)”
                                 variant_results.append(row)
-                    
-                    # 2. 如果数量对不上,尝试按照子容器逐项提取
-                    if not variant_results:
-                        items = wrappers[0].find_elements(By.CSS_SELECTOR, ".expand-view-list-item, [class*='list-item'], .sku-item")
-                        for item_el in items:
-                            try:
-                                l_el = item_el.find_element(By.CLASS_NAME, "item-label")
-                                p_el = item_el.find_element(By.CLASS_NAME, "item-price-stock")
-                                if l_el and p_el:
-                                    l_text = l_el.text.strip()
-                                    p_text = p_el.text.strip()
-                                    p_clean = re.sub(r'[^\d.]', '', p_text)
-                                    if l_text:
-                                        row = base_data.copy()
-                                        row["color"] = l_text
-                                        row["price"] = p_clean if p_clean else p_text
-                                        if not row["spec"]: row["spec"] = l_text
-                                        variant_results.append(row)
-                            except: continue
-            except Exception as e:
-                print(f"  [!] 变体区域解析异常: {e}")
+                        except: continue
+            except: pass
 
             if variant_results:
-                print(f"  [+] 成功解析到 {len(variant_results)} 个款式变体")
                 return variant_results
+            
+            # 方案 B: 回退到模型提取
+            sku_props = model.get("skuModel", {}).get("skuProps", []) or model.get("detailData", {}).get("skuProps", []) or []
+            main_prop = next((p for p in sku_props if any(k in p.get("prop", "") for k in ["颜色", "分类", "款式", "花色", "净含量"])), None)
+            if not main_prop and sku_props: main_prop = sku_props[0]
+            if main_prop and main_prop.get("value"):
+                results = []
+                for val in main_prop["value"]:
+                    if val.get("name"):
+                        row = base_data.copy()
+                        row["color"] = val.get("name")
+                        row["spec"] = val.get("name")
+                        row["price"] = trade.get("minPrice", "")
+                        results.append(row)
+                return results
+            
+            base_data["price"] = trade.get("minPrice", "")
             return [base_data]
         except: return None
 
     def _extract_all_methods(self):
+        """ 强化版探测:从内存变量中抓取列表 """
         results = []
         seen_ids = set()
         def add_item(name, link):