il y a 3 mois · e27b62117f
--- a/src/excel_handler.py
+++ b/src/excel_handler.py
@@ -12,6 +12,32 @@ def get_resource_path(relative_path):
 
															     base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
														
 
															     return os.path.join(base_dir, relative_path)
														
 
															+def get_existing_info(file_path):
														
 
															+    """
														
 
															+    读取已有文件中的链接和最后一行编码
														
 
															+    """
														
 
															+    links = set()
														
 
															+    last_code = 0
														
 
															+    if not os.path.exists(file_path):
														
 
															+        return links, last_code
														
 
															+    
														
 
															+    try:
														
 
															+        wb = load_workbook(file_path, data_only=True)
														
 
															+        ws = wb.active
														
 
															+        # 假设 A 列是编码，K 列是链接
														
 
															+        for r in range(3, ws.max_row + 1):
														
 
															+            code_val = ws.cell(row=r, column=1).value
														
 
															+            link_val = ws.cell(row=r, column=11).value
														
 
															+            
														
 
															+            if link_val:
														
 
															+                links.add(str(link_val).strip())
														
 
															+            
														
 
															+            if isinstance(code_val, (int, float)):
														
 
															+                last_code = max(last_code, int(code_val))
														
 
															+    except:
														
 
															+        pass
														
 
															+    return links, last_code
														
 
															+
														
 
															 def append_to_template(products, output_path, status_callback=None):
														
 
															     """
														
 
															     将产品数据追加写入到指定的 Excel 文件中。
														
@@ -36,13 +62,12 @@ def append_to_template(products, output_path, status_callback=None):
 
															     ws = wb.active
														
 
															-    # 3. 寻找起始行 (品类或名称为空的行)
														
 
															+    # 3. 寻找起始行 (基于第 11 列“产品链接”进行判定，防止覆盖)
														
 
															     start_row = 3
														
 
															     for r in range(3, ws.max_row + 2):
														
 
															-        val_cat = ws.cell(row=r, column=2).value
														
 
															-        val_name = ws.cell(row=r, column=4).value
														
 
															-        if (val_cat is None or str(val_cat).strip() == "") and \
														
 
															-           (val_name is None or str(val_name).strip() == ""):
														
 
															+        # 第 11 列是产品链接
														
 
															+        val_link = ws.cell(row=r, column=11).value
														
 
															+        if val_link is None or str(val_link).strip() == "":
														
 
															             start_row = r
														
 
															             break
														
 
															     else:
														
--- a/src/gui.py
+++ b/src/gui.py
@@ -5,12 +5,18 @@ import traceback
 
															 import pandas as pd
														
 
															 from PyQt6.QtWidgets import (QApplication, QMainWindow, QWidget, QVBoxLayout, 
														
 
															                              QHBoxLayout, QLineEdit, QPushButton, QTextEdit, 
														
 
															-                             QLabel, QFileDialog, QProgressBar, QTreeView, QSplitter, QCheckBox)
														
 
															+                             QLabel, QFileDialog, QProgressBar, QTreeView, QSplitter, QCheckBox, QSpinBox)
														
 
															 from PyQt6.QtCore import QThread, pyqtSignal, Qt
														
 
															-from PyQt6.QtGui import QStandardItemModel, QStandardItem
														
 
															+from PyQt6.QtGui import QStandardItemModel, QStandardItem, QIcon
														
 
															 from src.scraper import Scraper1688
														
 
															-from src.excel_handler import append_to_template
														
 
															+from src.excel_handler import append_to_template, get_existing_info
														
 
															+
														
 
															+def get_resource_path(relative_path):
														
 
															+    """ 获取资源绝对路径，兼容开发环境和 PyInstaller 打包环境 """
														
 
															+    if hasattr(sys, '_MEIPASS'):
														
 
															+        return os.path.join(sys._MEIPASS, relative_path)
														
 
															+    return os.path.join(os.getcwd(), relative_path)
														
 
															 class ScraperThread(QThread):
														
 
															     progress = pyqtSignal(int)
														
@@ -18,16 +24,22 @@ class ScraperThread(QThread):
 
															     # finished 信号增加耗时参数 (秒)
														
 
															     finished = pyqtSignal(str, object, float)
														
 
															-    def __init__(self, keyword, output_path, headless=True):
														
 
															+    def __init__(self, keyword, output_path, total_count, headless=True):
														
 
															         super().__init__()
														
 
															         self.keyword = keyword
														
 
															         self.output_path = output_path
														
 
															+        self.total_count = total_count
														
 
															         self.headless = headless
														
 
															     def run(self):
														
 
															         scraper = None
														
 
															         start_time = time.time()
														
 
															         try:
														
 
															+            # 读取已抓取的链接，实现断点续爬
														
 
															+            existing_links, _ = get_existing_info(self.output_path)
														
 
															+            if existing_links:
														
 
															+                self.log.emit(f"[*] 发现已有记录: {len(existing_links)} 条，将从新记录开始搜索...")
														
 
															+
														
 
															             self.log.emit(f"<b>[*] 任务启动: {self.keyword}</b>")
														
 
															             def status_cb(is_waiting, msg):
														
@@ -39,22 +51,20 @@ class ScraperThread(QThread):
 
															             scraper = Scraper1688(headless=self.headless, status_callback=status_cb)
														
 
															             # 使用流式生成器抓取
														
 
															-            total_target = 20
														
 
															-            # total_target = 200
														
 
															             collected_count = 0
														
 
															-            for batch_results in scraper.search_products_yield(self.keyword, total_count=total_target):
														
 
															+            for batch_results in scraper.search_products_yield(self.keyword, total_count=self.total_count, existing_links=existing_links):
														
 
															                 # 实时写入 Excel (此时 batch_results 为 10 条或页末余数)
														
 
															-                append_to_template(batch_results, self.output_path)
														
 
															+                append_to_template(batch_results, self.output_path, status_callback=status_cb)
														
 
															                 collected_count += len(batch_results)
														
 
															-                self.log.emit(f"[+] 数据已持久化: {len(batch_results)} 条，当前总计: {collected_count}")
														
 
															+                self.log.emit(f"[+] 新增数据已持久化: {len(batch_results)} 条，本次共计: {collected_count}")
														
 
															-                prog = int((collected_count / total_target) * 100)
														
 
															+                prog = int((collected_count / self.total_count) * 100)
														
 
															                 self.progress.emit(min(prog, 100))
														
 
															             duration = time.time() - start_time
														
 
															-            self.log.emit(f"<b>[完成] 任务结束，共抓取 {collected_count} 条数据。</b>")
														
 
															+            self.log.emit(f"<b>[完成] 任务结束，本次新增抓取 {collected_count} 条数据。</b>")
														
 
															             self.log.emit(f"<b>[耗时] 处理总时间: {duration:.2f} 秒</b>")
														
 
															             self.finished.emit("", scraper, duration)
														
 
															         except Exception as e:
														
@@ -76,6 +86,12 @@ class MainWindow(QMainWindow):
 
															     def initUI(self):
														
 
															         self.setWindowTitle("1688 产品信息实时抓取工具 v3.0")
														
 
															         self.setGeometry(100, 100, 1100, 750)
														
 
															+
														
 
															+        # 设置窗口图标
														
 
															+        icon_path = get_resource_path("app.ico")
														
 
															+        if os.path.exists(icon_path):
														
 
															+            self.setWindowIcon(QIcon(icon_path))
														
 
															+
														
 
															         central_widget = QWidget()
														
 
															         self.setCentralWidget(central_widget)
														
 
															         main_layout = QHBoxLayout(central_widget)
														
@@ -116,12 +132,37 @@ class MainWindow(QMainWindow):
 
															         action_layout = QHBoxLayout()
														
 
															         self.category_display = QLabel("请选择二级类目")
														
 
															+        
														
 
															+        # 抓取数量配置
														
 
															+        count_layout = QHBoxLayout()
														
 
															+        self.count_spin = QSpinBox()
														
 
															+        self.count_spin.setRange(1, 10000)
														
 
															+        self.count_spin.setValue(200)
														
 
															+        self.count_spin.setFixedWidth(80)
														
 
															+        count_layout.addWidget(QLabel("抓取数量:"))
														
 
															+        count_layout.addWidget(self.count_spin)
														
 
															+        
														
 
															         self.search_btn = QPushButton("开始抓取")
														
 
															+        self.search_btn.setEnabled(False) # 初始置灰，直到选择类目和路径
														
 
															         self.search_btn.clicked.connect(self.start_scraping)
														
 
															         self.search_btn.setMinimumHeight(50)
														
 
															-        self.search_btn.setStyleSheet("QPushButton { background-color: #0078d4; color: white; font-weight: bold; font-size: 16px; }")
														
 
															+        self.search_btn.setStyleSheet("""
														
 
															+            QPushButton { 
														
 
															+                background-color: #0078d4; 
														
 
															+                color: white; 
														
 
															+                font-weight: bold; 
														
 
															+                font-size: 16px; 
														
 
															+                border-radius: 4px;
														
 
															+            }
														
 
															+            QPushButton:disabled { 
														
 
															+                background-color: #cccccc; 
														
 
															+                color: #888888; 
														
 
															+            }
														
 
															+        """)
														
 
															+        
														
 
															         action_layout.addWidget(QLabel("<font color='red'>*</font>检索类目:"))
														
 
															         action_layout.addWidget(self.category_display, 1)
														
 
															+        action_layout.addLayout(count_layout)
														
 
															         action_layout.addWidget(self.search_btn)
														
 
															         right_layout.addLayout(action_layout)
														
@@ -178,6 +219,7 @@ class MainWindow(QMainWindow):
 
															             if self.output_base_path:
														
 
															                 full_p = os.path.normpath(os.path.join(self.output_base_path, "选品", self.selected_category_1, f"{self.selected_category_2}.xlsx"))
														
 
															                 self.path_display.setText(full_p)
														
 
															+                self.search_btn.setEnabled(True) # 仅在路径和类目都选好时启用按钮
														
 
															     def select_output_path(self):
														
 
															         p = QFileDialog.getExistingDirectory(self, "选择保存根目录")
														
@@ -192,18 +234,17 @@ class MainWindow(QMainWindow):
 
															         file_path = os.path.normpath(os.path.join(target_dir, f"{self.selected_category_2}.xlsx"))
														
 
															         self.current_output_file = file_path # 记录当前文件用于最后打开
														
 
															-        # 启动抓取前清理旧文件
														
 
															-        if os.path.exists(file_path):
														
 
															-            try: os.remove(file_path)
														
 
															-            except: pass
														
 
															+        # 启动抓取前不再删除旧文件，实现断点续爬功能
														
 
															         self.search_btn.setEnabled(False)
														
 
															+        self.count_spin.setEnabled(False) # 任务开始后也禁用数量输入
														
 
															         self.status_label.setText("处理中……")
														
 
															         self.log_output.clear()
														
 
															         self.pbar.setValue(0)
														
 
															         headless = not self.show_browser_cb.isChecked()
														
 
															+        total_count = self.count_spin.value()
														
 
															-        self.thread = ScraperThread(self.selected_category_2, file_path, headless)
														
 
															+        self.thread = ScraperThread(self.selected_category_2, file_path, total_count, headless)
														
 
															         self.thread.log.connect(self.log_output.append)
														
 
															         self.thread.progress.connect(self.pbar.setValue)
														
 
															         self.thread.finished.connect(self.on_finished)
														
@@ -211,6 +252,7 @@ class MainWindow(QMainWindow):
 
															     def on_finished(self, err, scraper, duration):
														
 
															         self.search_btn.setEnabled(True)
														
 
															+        self.count_spin.setEnabled(True) # 任务结束后恢复数量输入
														
 
															         if scraper: self.active_scraper = scraper
														
 
															         if not err:
														
--- a/src/scraper.py
+++ b/src/scraper.py
@@ -111,7 +111,7 @@ class Scraper1688:
 
															         return True
														
 
															     # def search_products_yield(self, keyword, total_count=200):
														
 
															-    def search_products_yield(self, keyword, total_count=20):
														
 
															+    def search_products_yield(self, keyword, total_count=200, existing_links=None):
														
 
															         gbk_keyword = urllib.parse.quote(keyword, encoding='gbk')
														
 
															         base_url = f"https://s.1688.com/selloffer/offer_search.htm?keywords={gbk_keyword}&n=y&netType=1%2C11%2C16"
														
@@ -119,12 +119,15 @@ class Scraper1688:
 
															         self.driver.get("https://www.1688.com")
														
 
															         self.check_for_captcha()
														
 
															-        all_links = set()
														
 
															+        all_links = existing_links if existing_links is not None else set()
														
 
															         page = 1
														
 
															         consecutive_empty_pages = 0
														
 
															-        while len(all_links) < total_count and consecutive_empty_pages < 3:
														
 
															-            print(f"[*] 正在抓取列表页: 第 {page} 页...")
														
 
															+        # 记录初始抓取的链接数，用于计算进度
														
 
															+        initial_count = len(all_links)
														
 
															+        
														
 
															+        while len(all_links) < total_count + initial_count and consecutive_empty_pages < 3:
														
 
															+            print(f"[*] 正在搜索列表页: 第 {page} 页...")
														
 
															             target_url = f"{base_url}&beginPage={page}&page={page}"
														
 
															             self.driver.get(target_url)
														
@@ -168,7 +171,7 @@ class Scraper1688:
 
															                     # 详情页抓取后的随机等待
														
 
															                     time.sleep(random.uniform(2, 4))
														
 
															-                    if len(all_links) >= total_count:
														
 
															+                    if len(all_links) >= total_count + initial_count:
														
 
															                         break
														
 
															             # 每页结束，将不足 10 条的余数 yield 出去
														
@@ -177,8 +180,8 @@ class Scraper1688:
 
															                 page_batch = []
														
 
															             page += 1
														
 
															-            if len(all_links) < total_count:
														
 
															-                print(f"[*] 累计已处理: {len(all_links)} 条，准备翻下一页...")
														
 
															+            if len(all_links) < total_count + initial_count:
														
 
															+                print(f"[*] 累计已处理新链接: {len(all_links) - initial_count} 条，准备翻下一页...")
														
 
															                 time.sleep(3)
														
 
															         return list(all_links)
														
@@ -263,7 +266,7 @@ class Scraper1688:
 
															                            or safe_text(By.CSS_SELECTOR, "div.company-name"),
														
 
															             }
														
 
															-            # --- 核心逻辑：拆分颜色分类 ---
														
 
															+            # --- 核心逻辑：拆分规格/颜色分类 ---
														
 
															             sku_props = []
														
 
															             try:
														
 
															                 # 尝试多种路径获取 SKU 属性
														
@@ -272,21 +275,27 @@ class Scraper1688:
 
															                             model.get("sku", {}).get("skuProps", [])
														
 
															             except: pass
														
 
															-            # 寻找“颜色分类”或类似的属性
														
 
															-            color_prop = next((p for p in sku_props if p.get("prop") in ["颜色", "颜色分类", "花色"]), None)
														
 
															+            # 智能寻找主维度：
														
 
															+            # 1. 优先找包含“颜色”、“分类”、“款式”、“花色”的维度
														
 
															+            # 2. 如果没有，则取第一个 SKU 维度（例如“净含量”、“规格”等）
														
 
															+            main_prop = None
														
 
															+            if sku_props:
														
 
															+                main_prop = next((p for p in sku_props if any(k in p.get("prop", "") for k in ["颜色", "分类", "款式", "花色"])), None)
														
 
															+                if not main_prop:
														
 
															+                    main_prop = sku_props[0]
														
 
															-            if color_prop and color_prop.get("value"):
														
 
															+            if main_prop and main_prop.get("value"):
														
 
															                 variant_results = []
														
 
															-                for val in color_prop["value"]:
														
 
															+                for val in main_prop["value"]:
														
 
															                     # 只有当该分类确实有名字时才记录
														
 
															-                    c_name = val.get("name")
														
 
															-                    if c_name:
														
 
															+                    variant_name = val.get("name")
														
 
															+                    if variant_name:
														
 
															                         row = base_data.copy()
														
 
															-                        row["color"] = c_name
														
 
															+                        row["color"] = variant_name
														
 
															                         variant_results.append(row)
														
 
															                 return variant_results
														
 
															             else:
														
 
															-                # 兜底：如果没有 SKU 拆分，则尝试获取单属性颜色
														
 
															+                # 兜底：如果没有发现规格选择区，则获取单属性颜色
														
 
															                 base_data["color"] = get_attr("颜色") or get_attr("颜色分类") or ""
														
 
															                 return [base_data]