Browse Source

回归Chrome,规格

LuTong 3 months ago
parent
commit
8b81ac6c6f
2 changed files with 55 additions and 9 deletions
  1. +7 −2
      src/gui.py
  2. +48 −7
      src/scraper.py

+ 7 - 2
src/gui.py

@@ -52,15 +52,20 @@ class ScraperThread(QThread):
             
             
             # 使用流式生成器抓取
             # 使用流式生成器抓取
             collected_count = 0
             collected_count = 0
+            product_index = 0
             
             
             for batch_results in scraper.search_products_yield(self.keyword, total_count=self.total_count, existing_links=existing_links):
             for batch_results in scraper.search_products_yield(self.keyword, total_count=self.total_count, existing_links=existing_links):
                 # 实时写入 Excel (此时 batch_results 为 10 条或页末余数)
                 # 实时写入 Excel (此时 batch_results 为 10 条或页末余数)
                 append_to_template(batch_results, self.output_path, status_callback=status_cb)
                 append_to_template(batch_results, self.output_path, status_callback=status_cb)
                 
                 
+                # 计算本批次包含的独立商品数量
+                unique_links_in_batch = len(set(item.get('link') for item in batch_results if item.get('link')))
+                product_index += unique_links_in_batch
                 collected_count += len(batch_results)
                 collected_count += len(batch_results)
-                self.log.emit(f"[+] 新增数据已持久化: {len(batch_results)} 条,本次共计: {collected_count}")
                 
                 
-                prog = int((collected_count / self.total_count) * 100)
+                self.log.emit(f"[+] 解析到第 {product_index} 个商品,新增数据已持久化: {len(batch_results)} 条,本次共计: {collected_count}")
+                
+                prog = int((product_index / self.total_count) * 100)
                 self.progress.emit(min(prog, 100))
                 self.progress.emit(min(prog, 100))
             
             
             duration = time.time() - start_time
             duration = time.time() - start_time

+ 48 - 7
src/scraper.py

@@ -272,7 +272,49 @@ class Scraper1688:
                            or safe_text(By.CSS_SELECTOR, "div.company-name"),
                            or safe_text(By.CSS_SELECTOR, "div.company-name"),
             }
             }
 
 
-            # --- 核心逻辑:拆分规格/颜色分类 ---
+            # --- 核心逻辑订正:智能识别规格区域(优先使用 DOM 检查,获取款式名称和逐条价格) ---
+            variant_data_list = []
+            try:
+                # 尝试用户提供的特定 DOM 容器
+                wrappers = self.driver.find_elements(By.CLASS_NAME, "expand-view-list-wrapper")
+                if wrappers:
+                    # 获取该容器下的所有子项容器(通常包含 item-label 和 item-price-stock)
+                    # 尝试定位包含这两者的条目级容器
+                    items = wrappers[0].find_elements(By.CSS_SELECTOR, ".expand-view-list-item, [class*='list-item'], .sku-item")
+                    
+                    if not items:
+                        # 如果找不到明确的子项容器,则根据 label 元素反向寻找或直接成对提取
+                        labels = wrappers[0].find_elements(By.CLASS_NAME, "item-label")
+                        prices = wrappers[0].find_elements(By.CLASS_NAME, "item-price-stock")
+                        for l, p in zip(labels, prices):
+                            variant_data_list.append({
+                                "label": l.text.strip(),
+                                "price": p.text.strip()
+                            })
+                    else:
+                        for item_el in items:
+                            try:
+                                label = item_el.find_element(By.CLASS_NAME, "item-label").text.strip()
+                                price = item_el.find_element(By.CLASS_NAME, "item-price-stock").text.strip()
+                                if label:
+                                    variant_data_list.append({"label": label, "price": price})
+                            except: continue
+            except: pass
+
+            if variant_data_list:
+                results = []
+                for vd in variant_data_list:
+                    row = base_data.copy()
+                    # 款式描述写入“颜色”列
+                    row["color"] = vd["label"]
+                    # 逐条价格写入“单品进价(元)”列 (即 price 键)
+                    # 清洗价格,移除 ¥ 等非数字字符,只保留数字和小数点
+                    clean_price = re.sub(r'[^\d.]', '', vd["price"])
+                    row["price"] = clean_price if clean_price else vd["price"]
+                    results.append(row)
+                return results
+
+            # --- 方案 B:如果 DOM 探测失败,回退到 JS 模型提取 ---
             sku_props = []
             sku_props = []
             try:
             try:
                 # 尝试多种路径获取 SKU 属性
                 # 尝试多种路径获取 SKU 属性
@@ -282,18 +324,17 @@ class Scraper1688:
             except: pass
             except: pass
 
 
             # 智能寻找主维度:
             # 智能寻找主维度:
-            # 1. 优先找包含“颜色”、“分类”、“款式”、“花色”的维度
-            # 2. 如果没有,则取第一个 SKU 维度(例如“净含量”、“规格”等)
+            # 1. 优先找包含“颜色”、“分类”、“款式”、“花色”、“净含量”的维度
+            # 2. 如果没有,则取第一个 SKU 维度
             main_prop = None
             main_prop = None
             if sku_props:
             if sku_props:
-                main_prop = next((p for p in sku_props if any(k in p.get("prop", "") for k in ["颜色", "分类", "款式", "花色"])), None)
+                main_prop = next((p for p in sku_props if any(k in p.get("prop", "") for k in ["颜色", "分类", "款式", "花色", "净含量", "规格"])), None)
                 if not main_prop:
                 if not main_prop:
                     main_prop = sku_props[0]
                     main_prop = sku_props[0]
             
             
             if main_prop and main_prop.get("value"):
             if main_prop and main_prop.get("value"):
                 variant_results = []
                 variant_results = []
                 for val in main_prop["value"]:
                 for val in main_prop["value"]:
-                    # 只有当该分类确实有名字时才记录
                     variant_name = val.get("name")
                     variant_name = val.get("name")
                     if variant_name:
                     if variant_name:
                         row = base_data.copy()
                         row = base_data.copy()
@@ -301,8 +342,8 @@ class Scraper1688:
                         variant_results.append(row)
                         variant_results.append(row)
                 return variant_results
                 return variant_results
             else:
             else:
-                # 兜底:如果没有发现规格选择区,则获取单属性颜色
-                base_data["color"] = get_attr("颜色") or get_attr("颜色分类") or ""
+                # 最终兜底:单属性
+                base_data["color"] = get_attr("颜色") or get_attr("颜色分类") or get_attr("净含量") or ""
                 return [base_data]
                 return [base_data]
 
 
         except Exception as e:
         except Exception as e: