LuTong пре 3 месеци
родитељ
комит
e27b62117f
3 измењених фајлова са 114 додато и 38 уклоњено
  1. +30 −5   src/excel_handler.py
  2. +59 −17  src/gui.py
  3. +25 −16  src/scraper.py

+ 30 - 5
src/excel_handler.py

@@ -12,6 +12,32 @@ def get_resource_path(relative_path):
     base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
     return os.path.join(base_dir, relative_path)
 
def get_existing_info(file_path):
    """
    Read the already-scraped product links and the highest row code
    from an existing workbook, to support resuming a scrape.

    Column A (1) is assumed to hold the numeric code and column K (11)
    the product link; data rows start at row 3 — TODO confirm against
    the template layout.

    :param file_path: path to the .xlsx file to inspect
    :return: tuple ``(links, last_code)`` — ``links`` is a set of
             stripped link strings, ``last_code`` the largest integer
             code seen (0 when the file is missing or unreadable)
    """
    links = set()
    last_code = 0
    if not os.path.exists(file_path):
        return links, last_code

    try:
        wb = load_workbook(file_path, data_only=True)
        try:
            ws = wb.active
            for r in range(3, ws.max_row + 1):
                code_val = ws.cell(row=r, column=1).value
                link_val = ws.cell(row=r, column=11).value

                if link_val:
                    links.add(str(link_val).strip())

                # bool is a subclass of int — exclude it so TRUE/FALSE
                # cells are never mistaken for row codes
                if isinstance(code_val, (int, float)) and not isinstance(code_val, bool):
                    last_code = max(last_code, int(code_val))
        finally:
            # release the file handle even when a row read raises,
            # so the scraper can still append to the same file
            wb.close()
    except Exception:
        # Best-effort read: a corrupt or locked workbook degrades to
        # "nothing scraped yet" instead of aborting the whole task.
        # (Narrowed from a bare ``except:`` which also swallowed
        # SystemExit/KeyboardInterrupt.)
        pass
    return links, last_code
+
 def append_to_template(products, output_path, status_callback=None):
     """
     将产品数据追加写入到指定的 Excel 文件中。
@@ -36,13 +62,12 @@ def append_to_template(products, output_path, status_callback=None):
     
     ws = wb.active
     
-    # 3. 寻找起始行 (品类或名称为空的行)
+    # 3. 寻找起始行 (基于第 11 列“产品链接”进行判定,防止覆盖)
     start_row = 3
     for r in range(3, ws.max_row + 2):
-        val_cat = ws.cell(row=r, column=2).value
-        val_name = ws.cell(row=r, column=4).value
-        if (val_cat is None or str(val_cat).strip() == "") and \
-           (val_name is None or str(val_name).strip() == ""):
+        # 第 11 列是产品链接
+        val_link = ws.cell(row=r, column=11).value
+        if val_link is None or str(val_link).strip() == "":
             start_row = r
             break
     else:

+ 59 - 17
src/gui.py

@@ -5,12 +5,18 @@ import traceback
 import pandas as pd
 from PyQt6.QtWidgets import (QApplication, QMainWindow, QWidget, QVBoxLayout, 
                              QHBoxLayout, QLineEdit, QPushButton, QTextEdit, 
-                             QLabel, QFileDialog, QProgressBar, QTreeView, QSplitter, QCheckBox)
+                             QLabel, QFileDialog, QProgressBar, QTreeView, QSplitter, QCheckBox, QSpinBox)
 from PyQt6.QtCore import QThread, pyqtSignal, Qt
-from PyQt6.QtGui import QStandardItemModel, QStandardItem
+from PyQt6.QtGui import QStandardItemModel, QStandardItem, QIcon
 
 from src.scraper import Scraper1688
-from src.excel_handler import append_to_template
+from src.excel_handler import append_to_template, get_existing_info
+
def get_resource_path(relative_path):
    """Resolve an absolute resource path, working both in a normal
    checkout and inside a PyInstaller one-file bundle (where assets
    are unpacked under ``sys._MEIPASS``)."""
    bundle_dir = getattr(sys, '_MEIPASS', None)
    if bundle_dir is None:
        # not frozen: resolve relative to the current working directory
        bundle_dir = os.getcwd()
    return os.path.join(bundle_dir, relative_path)
 
 class ScraperThread(QThread):
     progress = pyqtSignal(int)
@@ -18,16 +24,22 @@ class ScraperThread(QThread):
     # finished 信号增加耗时参数 (秒)
     finished = pyqtSignal(str, object, float)
 
-    def __init__(self, keyword, output_path, headless=True):
+    def __init__(self, keyword, output_path, total_count, headless=True):
         super().__init__()
         self.keyword = keyword
         self.output_path = output_path
+        self.total_count = total_count
         self.headless = headless
 
     def run(self):
         scraper = None
         start_time = time.time()
         try:
+            # 读取已抓取的链接,实现断点续爬
+            existing_links, _ = get_existing_info(self.output_path)
+            if existing_links:
+                self.log.emit(f"[*] 发现已有记录: {len(existing_links)} 条,将从新记录开始搜索...")
+
             self.log.emit(f"<b>[*] 任务启动: {self.keyword}</b>")
             
             def status_cb(is_waiting, msg):
@@ -39,22 +51,20 @@ class ScraperThread(QThread):
             scraper = Scraper1688(headless=self.headless, status_callback=status_cb)
             
             # 使用流式生成器抓取
-            total_target = 20
-            # total_target = 200
             collected_count = 0
             
-            for batch_results in scraper.search_products_yield(self.keyword, total_count=total_target):
+            for batch_results in scraper.search_products_yield(self.keyword, total_count=self.total_count, existing_links=existing_links):
                 # 实时写入 Excel (此时 batch_results 为 10 条或页末余数)
-                append_to_template(batch_results, self.output_path)
+                append_to_template(batch_results, self.output_path, status_callback=status_cb)
                 
                 collected_count += len(batch_results)
-                self.log.emit(f"[+] 数据已持久化: {len(batch_results)} 条,当前总计: {collected_count}")
+                self.log.emit(f"[+] 新增数据已持久化: {len(batch_results)} 条,本次共计: {collected_count}")
                 
-                prog = int((collected_count / total_target) * 100)
+                prog = int((collected_count / self.total_count) * 100)
                 self.progress.emit(min(prog, 100))
             
             duration = time.time() - start_time
-            self.log.emit(f"<b>[完成] 任务结束,抓取 {collected_count} 条数据。</b>")
+            self.log.emit(f"<b>[完成] 任务结束,本次新增抓取 {collected_count} 条数据。</b>")
             self.log.emit(f"<b>[耗时] 处理总时间: {duration:.2f} 秒</b>")
             self.finished.emit("", scraper, duration)
         except Exception as e:
@@ -76,6 +86,12 @@ class MainWindow(QMainWindow):
     def initUI(self):
         self.setWindowTitle("1688 产品信息实时抓取工具 v3.0")
         self.setGeometry(100, 100, 1100, 750)
+
+        # 设置窗口图标
+        icon_path = get_resource_path("app.ico")
+        if os.path.exists(icon_path):
+            self.setWindowIcon(QIcon(icon_path))
+
         central_widget = QWidget()
         self.setCentralWidget(central_widget)
         main_layout = QHBoxLayout(central_widget)
@@ -116,12 +132,37 @@ class MainWindow(QMainWindow):
 
         action_layout = QHBoxLayout()
         self.category_display = QLabel("请选择二级类目")
+        
+        # 抓取数量配置
+        count_layout = QHBoxLayout()
+        self.count_spin = QSpinBox()
+        self.count_spin.setRange(1, 10000)
+        self.count_spin.setValue(200)
+        self.count_spin.setFixedWidth(80)
+        count_layout.addWidget(QLabel("抓取数量:"))
+        count_layout.addWidget(self.count_spin)
+        
         self.search_btn = QPushButton("开始抓取")
+        self.search_btn.setEnabled(False) # 初始置灰,直到选择类目和路径
         self.search_btn.clicked.connect(self.start_scraping)
         self.search_btn.setMinimumHeight(50)
-        self.search_btn.setStyleSheet("QPushButton { background-color: #0078d4; color: white; font-weight: bold; font-size: 16px; }")
+        self.search_btn.setStyleSheet("""
+            QPushButton { 
+                background-color: #0078d4; 
+                color: white; 
+                font-weight: bold; 
+                font-size: 16px; 
+                border-radius: 4px;
+            }
+            QPushButton:disabled { 
+                background-color: #cccccc; 
+                color: #888888; 
+            }
+        """)
+        
         action_layout.addWidget(QLabel("<font color='red'>*</font>检索类目:"))
         action_layout.addWidget(self.category_display, 1)
+        action_layout.addLayout(count_layout)
         action_layout.addWidget(self.search_btn)
         right_layout.addLayout(action_layout)
 
@@ -178,6 +219,7 @@ class MainWindow(QMainWindow):
             if self.output_base_path:
                 full_p = os.path.normpath(os.path.join(self.output_base_path, "选品", self.selected_category_1, f"{self.selected_category_2}.xlsx"))
                 self.path_display.setText(full_p)
+                self.search_btn.setEnabled(True) # 仅在路径和类目都选好时启用按钮
 
     def select_output_path(self):
         p = QFileDialog.getExistingDirectory(self, "选择保存根目录")
@@ -192,18 +234,17 @@ class MainWindow(QMainWindow):
         file_path = os.path.normpath(os.path.join(target_dir, f"{self.selected_category_2}.xlsx"))
         self.current_output_file = file_path # 记录当前文件用于最后打开
         
-        # 启动抓取前清理旧文件
-        if os.path.exists(file_path):
-            try: os.remove(file_path)
-            except: pass
+        # 启动抓取前不再删除旧文件,实现断点续爬功能
 
         self.search_btn.setEnabled(False)
+        self.count_spin.setEnabled(False) # 任务开始后也禁用数量输入
         self.status_label.setText("处理中……")
         self.log_output.clear()
         self.pbar.setValue(0)
         headless = not self.show_browser_cb.isChecked()
+        total_count = self.count_spin.value()
         
-        self.thread = ScraperThread(self.selected_category_2, file_path, headless)
+        self.thread = ScraperThread(self.selected_category_2, file_path, total_count, headless)
         self.thread.log.connect(self.log_output.append)
         self.thread.progress.connect(self.pbar.setValue)
         self.thread.finished.connect(self.on_finished)
@@ -211,6 +252,7 @@ class MainWindow(QMainWindow):
 
     def on_finished(self, err, scraper, duration):
         self.search_btn.setEnabled(True)
+        self.count_spin.setEnabled(True) # 任务结束后恢复数量输入
         if scraper: self.active_scraper = scraper
         
         if not err:

+ 25 - 16
src/scraper.py

@@ -111,7 +111,7 @@ class Scraper1688:
         return True
 
     # def search_products_yield(self, keyword, total_count=200):
-    def search_products_yield(self, keyword, total_count=20):
+    def search_products_yield(self, keyword, total_count=200, existing_links=None):
         gbk_keyword = urllib.parse.quote(keyword, encoding='gbk')
         base_url = f"https://s.1688.com/selloffer/offer_search.htm?keywords={gbk_keyword}&n=y&netType=1%2C11%2C16"
         
@@ -119,12 +119,15 @@ class Scraper1688:
         self.driver.get("https://www.1688.com")
         self.check_for_captcha()
 
-        all_links = set()
+        all_links = existing_links if existing_links is not None else set()
         page = 1
         consecutive_empty_pages = 0
         
-        while len(all_links) < total_count and consecutive_empty_pages < 3:
-            print(f"[*] 正在抓取列表页: 第 {page} 页...")
+        # 记录初始抓取的链接数,用于计算进度
+        initial_count = len(all_links)
+        
+        while len(all_links) < total_count + initial_count and consecutive_empty_pages < 3:
+            print(f"[*] 正在搜索列表页: 第 {page} 页...")
             target_url = f"{base_url}&beginPage={page}&page={page}"
             self.driver.get(target_url)
             
@@ -168,7 +171,7 @@ class Scraper1688:
                     # 详情页抓取后的随机等待
                     time.sleep(random.uniform(2, 4))
                     
-                    if len(all_links) >= total_count:
+                    if len(all_links) >= total_count + initial_count:
                         break
             
             # 每页结束,将不足 10 条的余数 yield 出去
@@ -177,8 +180,8 @@ class Scraper1688:
                 page_batch = []
 
             page += 1
-            if len(all_links) < total_count:
-                print(f"[*] 累计已处理: {len(all_links)} 条,准备翻下一页...")
+            if len(all_links) < total_count + initial_count:
+                print(f"[*] 累计已处理新链接: {len(all_links) - initial_count} 条,准备翻下一页...")
                 time.sleep(3)
 
         return list(all_links)
@@ -263,7 +266,7 @@ class Scraper1688:
                            or safe_text(By.CSS_SELECTOR, "div.company-name"),
             }
 
-            # --- 核心逻辑:拆分颜色分类 ---
+            # --- 核心逻辑:拆分规格/颜色分类 ---
             sku_props = []
             try:
                 # 尝试多种路径获取 SKU 属性
@@ -272,21 +275,27 @@ class Scraper1688:
                             model.get("sku", {}).get("skuProps", [])
             except: pass
 
-            # 寻找“颜色分类”或类似的属性
-            color_prop = next((p for p in sku_props if p.get("prop") in ["颜色", "颜色分类", "花色"]), None)
+            # 智能寻找主维度:
+            # 1. 优先找包含“颜色”、“分类”、“款式”、“花色”的维度
+            # 2. 如果没有,则取第一个 SKU 维度(例如“净含量”、“规格”等)
+            main_prop = None
+            if sku_props:
+                main_prop = next((p for p in sku_props if any(k in p.get("prop", "") for k in ["颜色", "分类", "款式", "花色"])), None)
+                if not main_prop:
+                    main_prop = sku_props[0]
             
-            if color_prop and color_prop.get("value"):
+            if main_prop and main_prop.get("value"):
                 variant_results = []
-                for val in color_prop["value"]:
+                for val in main_prop["value"]:
                     # 只有当该分类确实有名字时才记录
-                    c_name = val.get("name")
-                    if c_name:
+                    variant_name = val.get("name")
+                    if variant_name:
                         row = base_data.copy()
-                        row["color"] = c_name
+                        row["color"] = variant_name
                         variant_results.append(row)
                 return variant_results
             else:
-                # 兜底:如果没有 SKU 拆分,则尝试获取单属性颜色
+                # 兜底:如果没有发现规格选择区,则获取单属性颜色
                 base_data["color"] = get_attr("颜色") or get_attr("颜色分类") or ""
                 return [base_data]