LuTong пре 3 месеци
родитељ
комит
e27b62117f
3 измењених фајлова са 114 додато и 38 уклоњено
  1. +30 −5   src/excel_handler.py
  2. +59 −17  src/gui.py
  3. +25 −16  src/scraper.py

+ 30 - 5
src/excel_handler.py

@@ -12,6 +12,32 @@ def get_resource_path(relative_path):
     base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
     return os.path.join(base_dir, relative_path)
 
def get_existing_info(file_path):
    """
    Read the already-scraped product links and the highest row code
    from an existing workbook, to support resuming a scrape.

    Column A (1) is assumed to hold the numeric code and column K (11)
    the product link; data rows start at row 3 — TODO confirm against
    the template layout.

    :param file_path: path to the .xlsx file to inspect
    :return: tuple ``(links, last_code)`` — ``links`` is a set of
             stripped link strings, ``last_code`` the largest integer
             code seen (0 when the file is missing or unreadable)
    """
    links = set()
    last_code = 0
    if not os.path.exists(file_path):
        return links, last_code

    try:
        wb = load_workbook(file_path, data_only=True)
        try:
            ws = wb.active
            for r in range(3, ws.max_row + 1):
                code_val = ws.cell(row=r, column=1).value
                link_val = ws.cell(row=r, column=11).value

                if link_val:
                    links.add(str(link_val).strip())

                # bool is a subclass of int — exclude it so TRUE/FALSE
                # cells are never mistaken for row codes
                if isinstance(code_val, (int, float)) and not isinstance(code_val, bool):
                    last_code = max(last_code, int(code_val))
        finally:
            # release the file handle even when a row read raises,
            # so the scraper can still append to the same file
            wb.close()
    except Exception:
        # Best-effort read: a corrupt or locked workbook degrades to
        # "nothing scraped yet" instead of aborting the whole task.
        # (Narrowed from a bare ``except:`` which also swallowed
        # SystemExit/KeyboardInterrupt.)
        pass
    return links, last_code
+
 def append_to_template(products, output_path, status_callback=None):
     """
     将产品数据追加写入到指定的 Excel 文件中。
@@ -36,13 +62,12 @@ def append_to_template(products, output_path, status_callback=None):
     
     ws = wb.active
     
-    # 3. 寻找起始行 (品类或名称为空的行)
+    # 3. 寻找起始行 (基于第 11 列“产品链接”进行判定,防止覆盖)
     start_row = 3
     for r in range(3, ws.max_row + 2):
-        val_cat = ws.cell(row=r, column=2).value
-        val_name = ws.cell(row=r, column=4).value
-        if (val_cat is None or str(val_cat).strip() == "") and \
-           (val_name is None or str(val_name).strip() == ""):
+        # 第 11 列是产品链接
+        val_link = ws.cell(row=r, column=11).value
+        if val_link is None or str(val_link).strip() == "":
             start_row = r
             break
     else:

+ 59 - 17
src/gui.py

@@ -5,12 +5,18 @@ import traceback
 import pandas as pd
 from PyQt6.QtWidgets import (QApplication, QMainWindow, QWidget, QVBoxLayout, 
                              QHBoxLayout, QLineEdit, QPushButton, QTextEdit, 
-                             QLabel, QFileDialog, QProgressBar, QTreeView, QSplitter, QCheckBox)
+                             QLabel, QFileDialog, QProgressBar, QTreeView, QSplitter, QCheckBox, QSpinBox)
 from PyQt6.QtCore import QThread, pyqtSignal, Qt
-from PyQt6.QtGui import QStandardItemModel, QStandardItem
+from PyQt6.QtGui import QStandardItemModel, QStandardItem, QIcon
 
 from src.scraper import Scraper1688
-from src.excel_handler import append_to_template
+from src.excel_handler import append_to_template, get_existing_info
+
def get_resource_path(relative_path):
    """Resolve an absolute resource path, working both in a normal
    checkout and inside a PyInstaller one-file bundle (where assets
    are unpacked under ``sys._MEIPASS``)."""
    bundle_dir = getattr(sys, '_MEIPASS', None)
    if bundle_dir is None:
        # not frozen: resolve relative to the current working directory
        bundle_dir = os.getcwd()
    return os.path.join(bundle_dir, relative_path)
 
 class ScraperThread(QThread):
     progress = pyqtSignal(int)
@@ -18,16 +24,22 @@ class ScraperThread(QThread):
     # finished 信号增加耗时参数 (秒)
     finished = pyqtSignal(str, object, float)
 
-    def __init__(self, keyword, output_path, headless=True):
+    def __init__(self, keyword, output_path, total_count, headless=True):
         super().__init__()
         self.keyword = keyword
         self.output_path = output_path
+        self.total_count = total_count
         self.headless = headless
 
     def run(self):
         scraper = None
         start_time = time.time()
         try:
+            # 读取已抓取的链接,实现断点续爬
+            existing_links, _ = get_existing_info(self.output_path)
+            if existing_links:
+                self.log.emit(f"[*] 发现已有记录: {len(existing_links)} 条,将从新记录开始搜索...")
+
             self.log.emit(f"<b>[*] 任务启动: {self.keyword}</b>")
             
             def status_cb(is_waiting, msg):
@@ -39,22 +51,20 @@ class ScraperThread(QThread):
             scraper = Scraper1688(headless=self.headless, status_callback=status_cb)
             
             # 使用流式生成器抓取
-            total_target = 20
-            # total_target = 200
             collected_count = 0
             
-            for batch_results in scraper.search_products_yield(self.keyword, total_count=total_target):
+            for batch_results in scraper.search_products_yield(self.keyword, total_count=self.total_count, existing_links=existing_links):
                 # 实时写入 Excel (此时 batch_results 为 10 条或页末余数)
-                append_to_template(batch_results, self.output_path)
+                append_to_template(batch_results, self.output_path, status_callback=status_cb)
                 
                 collected_count += len(batch_results)
-                self.log.emit(f"[+] 数据已持久化: {len(batch_results)} 条,当前总计: {collected_count}")
+                self.log.emit(f"[+] 新增数据已持久化: {len(batch_results)} 条,本次共计: {collected_count}")
                 
-                prog = int((collected_count / total_target) * 100)
+                prog = int((collected_count / self.total_count) * 100)
                 self.progress.emit(min(prog, 100))
             
             duration = time.time() - start_time
-            self.log.emit(f"<b>[完成] 任务结束,抓取 {collected_count} 条数据。</b>")
+            self.log.emit(f"<b>[完成] 任务结束,本次新增抓取 {collected_count} 条数据。</b>")
             self.log.emit(f"<b>[耗时] 处理总时间: {duration:.2f} 秒</b>")
             self.finished.emit("", scraper, duration)
         except Exception as e:
@@ -76,6 +86,12 @@ class MainWindow(QMainWindow):
     def initUI(self):
         self.setWindowTitle("1688 产品信息实时抓取工具 v3.0")
         self.setGeometry(100, 100, 1100, 750)
+
+        # 设置窗口图标
+        icon_path = get_resource_path("app.ico")
+        if os.path.exists(icon_path):
+            self.setWindowIcon(QIcon(icon_path))
+
         central_widget = QWidget()
         self.setCentralWidget(central_widget)
         main_layout = QHBoxLayout(central_widget)
@@ -116,12 +132,37 @@ class MainWindow(QMainWindow):
 
         action_layout = QHBoxLayout()
         self.category_display = QLabel("请选择二级类目")
+        
+        # 抓取数量配置
+        count_layout = QHBoxLayout()
+        self.count_spin = QSpinBox()
+        self.count_spin.setRange(1, 10000)
+        self.count_spin.setValue(200)
+        self.count_spin.setFixedWidth(80)
+        count_layout.addWidget(QLabel("抓取数量:"))
+        count_layout.addWidget(self.count_spin)
+        
         self.search_btn = QPushButton("开始抓取")
+        self.search_btn.setEnabled(False) # 初始置灰,直到选择类目和路径
         self.search_btn.clicked.connect(self.start_scraping)
         self.search_btn.setMinimumHeight(50)
-        self.search_btn.setStyleSheet("QPushButton { background-color: #0078d4; color: white; font-weight: bold; font-size: 16px; }")
+        self.search_btn.setStyleSheet("""
+            QPushButton { 
+                background-color: #0078d4; 
+                color: white; 
+                font-weight: bold; 
+                font-size: 16px; 
+                border-radius: 4px;
+            }
+            QPushButton:disabled { 
+                background-color: #cccccc; 
+                color: #888888; 
+            }
+        """)
+        
         action_layout.addWidget(QLabel("<font color='red'>*</font>检索类目:"))
         action_layout.addWidget(self.category_display, 1)
+        action_layout.addLayout(count_layout)
         action_layout.addWidget(self.search_btn)
         right_layout.addLayout(action_layout)
 
@@ -178,6 +219,7 @@ class MainWindow(QMainWindow):
             if self.output_base_path:
                 full_p = os.path.normpath(os.path.join(self.output_base_path, "选品", self.selected_category_1, f"{self.selected_category_2}.xlsx"))
                 self.path_display.setText(full_p)
+                self.search_btn.setEnabled(True) # 仅在路径和类目都选好时启用按钮
 
     def select_output_path(self):
         p = QFileDialog.getExistingDirectory(self, "选择保存根目录")
@@ -192,18 +234,17 @@ class MainWindow(QMainWindow):
         file_path = os.path.normpath(os.path.join(target_dir, f"{self.selected_category_2}.xlsx"))
         self.current_output_file = file_path # 记录当前文件用于最后打开
         
-        # 启动抓取前清理旧文件
-        if os.path.exists(file_path):
-            try: os.remove(file_path)
-            except: pass
+        # 启动抓取前不再删除旧文件,实现断点续爬功能
 
         self.search_btn.setEnabled(False)
+        self.count_spin.setEnabled(False) # 任务开始后也禁用数量输入
         self.status_label.setText("处理中……")
         self.log_output.clear()
         self.pbar.setValue(0)
         headless = not self.show_browser_cb.isChecked()
+        total_count = self.count_spin.value()
         
-        self.thread = ScraperThread(self.selected_category_2, file_path, headless)
+        self.thread = ScraperThread(self.selected_category_2, file_path, total_count, headless)
         self.thread.log.connect(self.log_output.append)
         self.thread.progress.connect(self.pbar.setValue)
         self.thread.finished.connect(self.on_finished)
@@ -211,6 +252,7 @@ class MainWindow(QMainWindow):
 
     def on_finished(self, err, scraper, duration):
         self.search_btn.setEnabled(True)
+        self.count_spin.setEnabled(True) # 任务结束后恢复数量输入
         if scraper: self.active_scraper = scraper
         
         if not err:

+ 25 - 16
src/scraper.py

@@ -111,7 +111,7 @@ class Scraper1688:
         return True
 
     # def search_products_yield(self, keyword, total_count=200):
-    def search_products_yield(self, keyword, total_count=20):
+    def search_products_yield(self, keyword, total_count=200, existing_links=None):
         gbk_keyword = urllib.parse.quote(keyword, encoding='gbk')
         base_url = f"https://s.1688.com/selloffer/offer_search.htm?keywords={gbk_keyword}&n=y&netType=1%2C11%2C16"
         
@@ -119,12 +119,15 @@ class Scraper1688:
         self.driver.get("https://www.1688.com")
         self.check_for_captcha()
 
-        all_links = set()
+        all_links = existing_links if existing_links is not None else set()
         page = 1
         consecutive_empty_pages = 0
         
-        while len(all_links) < total_count and consecutive_empty_pages < 3:
-            print(f"[*] 正在抓取列表页: 第 {page} 页...")
+        # 记录初始抓取的链接数,用于计算进度
+        initial_count = len(all_links)
+        
+        while len(all_links) < total_count + initial_count and consecutive_empty_pages < 3:
+            print(f"[*] 正在搜索列表页: 第 {page} 页...")
             target_url = f"{base_url}&beginPage={page}&page={page}"
             self.driver.get(target_url)
             
@@ -168,7 +171,7 @@ class Scraper1688:
                     # 详情页抓取后的随机等待
                     time.sleep(random.uniform(2, 4))
                     
-                    if len(all_links) >= total_count:
+                    if len(all_links) >= total_count + initial_count:
                         break
             
             # 每页结束,将不足 10 条的余数 yield 出去
@@ -177,8 +180,8 @@ class Scraper1688:
                 page_batch = []
 
             page += 1
-            if len(all_links) < total_count:
-                print(f"[*] 累计已处理: {len(all_links)} 条,准备翻下一页...")
+            if len(all_links) < total_count + initial_count:
+                print(f"[*] 累计已处理新链接: {len(all_links) - initial_count} 条,准备翻下一页...")
                 time.sleep(3)
 
         return list(all_links)
@@ -263,7 +266,7 @@ class Scraper1688:
                            or safe_text(By.CSS_SELECTOR, "div.company-name"),
             }
 
-            # --- 核心逻辑:拆分颜色分类 ---
+            # --- 核心逻辑:拆分规格/颜色分类 ---
             sku_props = []
             try:
                 # 尝试多种路径获取 SKU 属性
@@ -272,21 +275,27 @@ class Scraper1688:
                             model.get("sku", {}).get("skuProps", [])
             except: pass
 
-            # 寻找“颜色分类”或类似的属性
-            color_prop = next((p for p in sku_props if p.get("prop") in ["颜色", "颜色分类", "花色"]), None)
+            # 智能寻找主维度:
+            # 1. 优先找包含“颜色”、“分类”、“款式”、“花色”的维度
+            # 2. 如果没有,则取第一个 SKU 维度(例如“净含量”、“规格”等)
+            main_prop = None
+            if sku_props:
+                main_prop = next((p for p in sku_props if any(k in p.get("prop", "") for k in ["颜色", "分类", "款式", "花色"])), None)
+                if not main_prop:
+                    main_prop = sku_props[0]
             
-            if color_prop and color_prop.get("value"):
+            if main_prop and main_prop.get("value"):
                 variant_results = []
-                for val in color_prop["value"]:
+                for val in main_prop["value"]:
                     # 只有当该分类确实有名字时才记录
-                    c_name = val.get("name")
-                    if c_name:
+                    variant_name = val.get("name")
+                    if variant_name:
                         row = base_data.copy()
-                        row["color"] = c_name
+                        row["color"] = variant_name
                         variant_results.append(row)
                 return variant_results
             else:
-                # 兜底:如果没有 SKU 拆分,则尝试获取单属性颜色
+                # 兜底:如果没有发现规格选择区,则获取单属性颜色
                 base_data["color"] = get_attr("颜色") or get_attr("颜色分类") or ""
                 return [base_data]