LuTong il y a 3 mois
Parent
commit
e27b62117f
3 fichiers modifiés avec 114 ajouts et 38 suppressions
  1. 30 5
      src/excel_handler.py
  2. 59 17
      src/gui.py
  3. 25 16
      src/scraper.py

+ 30 - 5
src/excel_handler.py

@@ -12,6 +12,32 @@ def get_resource_path(relative_path):
     base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
     base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
     return os.path.join(base_dir, relative_path)
     return os.path.join(base_dir, relative_path)
 
 
+def get_existing_info(file_path):
+    """
+    Read the product links and the highest numeric code already stored in a workbook.
+
+    Rows are scanned from row 3 downward on the active sheet; column 1 (A) is
+    read as the code and column 11 (K) as the product link.
+
+    Args:
+        file_path: Path of the Excel file to inspect.
+
+    Returns:
+        A ``(links, last_code)`` tuple. ``links`` is a set with the stripped,
+        non-empty link cell values as strings; ``last_code`` is the largest
+        numeric code found (0 when there is none). A missing file yields
+        ``(set(), 0)``. If reading fails part-way, whatever was collected up
+        to that point is returned.
+    """
+    links = set()
+    last_code = 0
+    if not os.path.exists(file_path):
+        return links, last_code
+
+    try:
+        wb = load_workbook(file_path, data_only=True)
+        ws = wb.active
+        # Assumes column A holds the code and column K holds the link.
+        for r in range(3, ws.max_row + 1):
+            code_val = ws.cell(row=r, column=1).value
+            link_val = ws.cell(row=r, column=11).value
+
+            if link_val:
+                link_text = str(link_val).strip()
+                # A whitespace-only cell is not a link; do not record "".
+                if link_text:
+                    links.add(link_text)
+
+            # bool is a subclass of int; a TRUE/FALSE cell is not a code.
+            if isinstance(code_val, (int, float)) and not isinstance(code_val, bool):
+                last_code = max(last_code, int(code_val))
+    except Exception as exc:
+        # Still best-effort (callers get the partial result), but report the
+        # failure: otherwise an unreadable file is indistinguishable from an
+        # empty one and already-saved links would be scraped again silently.
+        print(f"[!] 读取已有文件失败: {file_path} ({exc})")
+    return links, last_code
+
 def append_to_template(products, output_path, status_callback=None):
 def append_to_template(products, output_path, status_callback=None):
     """
     """
     将产品数据追加写入到指定的 Excel 文件中。
     将产品数据追加写入到指定的 Excel 文件中。
@@ -36,13 +62,12 @@ def append_to_template(products, output_path, status_callback=None):
     
     
     ws = wb.active
     ws = wb.active
     
     
-    # 3. 寻找起始行 (品类或名称为空的行)
+    # 3. 寻找起始行 (基于第 11 列“产品链接”进行判定,防止覆盖)
     start_row = 3
     start_row = 3
     for r in range(3, ws.max_row + 2):
     for r in range(3, ws.max_row + 2):
-        val_cat = ws.cell(row=r, column=2).value
-        val_name = ws.cell(row=r, column=4).value
-        if (val_cat is None or str(val_cat).strip() == "") and \
-           (val_name is None or str(val_name).strip() == ""):
+        # 第 11 列是产品链接
+        val_link = ws.cell(row=r, column=11).value
+        if val_link is None or str(val_link).strip() == "":
             start_row = r
             start_row = r
             break
             break
     else:
     else:

+ 59 - 17
src/gui.py

@@ -5,12 +5,18 @@ import traceback
 import pandas as pd
 import pandas as pd
 from PyQt6.QtWidgets import (QApplication, QMainWindow, QWidget, QVBoxLayout, 
 from PyQt6.QtWidgets import (QApplication, QMainWindow, QWidget, QVBoxLayout, 
                              QHBoxLayout, QLineEdit, QPushButton, QTextEdit, 
                              QHBoxLayout, QLineEdit, QPushButton, QTextEdit, 
-                             QLabel, QFileDialog, QProgressBar, QTreeView, QSplitter, QCheckBox)
+                             QLabel, QFileDialog, QProgressBar, QTreeView, QSplitter, QCheckBox, QSpinBox)
 from PyQt6.QtCore import QThread, pyqtSignal, Qt
 from PyQt6.QtCore import QThread, pyqtSignal, Qt
-from PyQt6.QtGui import QStandardItemModel, QStandardItem
+from PyQt6.QtGui import QStandardItemModel, QStandardItem, QIcon
 
 
 from src.scraper import Scraper1688
 from src.scraper import Scraper1688
-from src.excel_handler import append_to_template
+from src.excel_handler import append_to_template, get_existing_info
+
+def get_resource_path(relative_path):
+    """
+    Resolve a resource path for both development and PyInstaller-bundled runs.
+
+    When ``sys._MEIPASS`` is present the path is joined onto it; otherwise it
+    is joined onto the current working directory.
+    """
+    base_dir = sys._MEIPASS if hasattr(sys, '_MEIPASS') else os.getcwd()
+    return os.path.join(base_dir, relative_path)
 
 
 class ScraperThread(QThread):
 class ScraperThread(QThread):
     progress = pyqtSignal(int)
     progress = pyqtSignal(int)
@@ -18,16 +24,22 @@ class ScraperThread(QThread):
     # finished 信号增加耗时参数 (秒)
     # finished 信号增加耗时参数 (秒)
     finished = pyqtSignal(str, object, float)
     finished = pyqtSignal(str, object, float)
 
 
-    def __init__(self, keyword, output_path, headless=True):
+    def __init__(self, keyword, output_path, total_count, headless=True):
         super().__init__()
         super().__init__()
         self.keyword = keyword
         self.keyword = keyword
         self.output_path = output_path
         self.output_path = output_path
+        self.total_count = total_count
         self.headless = headless
         self.headless = headless
 
 
     def run(self):
     def run(self):
         scraper = None
         scraper = None
         start_time = time.time()
         start_time = time.time()
         try:
         try:
+            # 读取已抓取的链接,实现断点续爬
+            existing_links, _ = get_existing_info(self.output_path)
+            if existing_links:
+                self.log.emit(f"[*] 发现已有记录: {len(existing_links)} 条,将从新记录开始搜索...")
+
             self.log.emit(f"<b>[*] 任务启动: {self.keyword}</b>")
             self.log.emit(f"<b>[*] 任务启动: {self.keyword}</b>")
             
             
             def status_cb(is_waiting, msg):
             def status_cb(is_waiting, msg):
@@ -39,22 +51,20 @@ class ScraperThread(QThread):
             scraper = Scraper1688(headless=self.headless, status_callback=status_cb)
             scraper = Scraper1688(headless=self.headless, status_callback=status_cb)
             
             
             # 使用流式生成器抓取
             # 使用流式生成器抓取
-            total_target = 20
-            # total_target = 200
             collected_count = 0
             collected_count = 0
             
             
-            for batch_results in scraper.search_products_yield(self.keyword, total_count=total_target):
+            for batch_results in scraper.search_products_yield(self.keyword, total_count=self.total_count, existing_links=existing_links):
                 # 实时写入 Excel (此时 batch_results 为 10 条或页末余数)
                 # 实时写入 Excel (此时 batch_results 为 10 条或页末余数)
-                append_to_template(batch_results, self.output_path)
+                append_to_template(batch_results, self.output_path, status_callback=status_cb)
                 
                 
                 collected_count += len(batch_results)
                 collected_count += len(batch_results)
-                self.log.emit(f"[+] 数据已持久化: {len(batch_results)} 条,当前总计: {collected_count}")
+                self.log.emit(f"[+] 新增数据已持久化: {len(batch_results)} 条,本次共计: {collected_count}")
                 
                 
-                prog = int((collected_count / total_target) * 100)
+                prog = int((collected_count / self.total_count) * 100)
                 self.progress.emit(min(prog, 100))
                 self.progress.emit(min(prog, 100))
             
             
             duration = time.time() - start_time
             duration = time.time() - start_time
-            self.log.emit(f"<b>[完成] 任务结束,抓取 {collected_count} 条数据。</b>")
+            self.log.emit(f"<b>[完成] 任务结束,本次新增抓取 {collected_count} 条数据。</b>")
             self.log.emit(f"<b>[耗时] 处理总时间: {duration:.2f} 秒</b>")
             self.log.emit(f"<b>[耗时] 处理总时间: {duration:.2f} 秒</b>")
             self.finished.emit("", scraper, duration)
             self.finished.emit("", scraper, duration)
         except Exception as e:
         except Exception as e:
@@ -76,6 +86,12 @@ class MainWindow(QMainWindow):
     def initUI(self):
     def initUI(self):
         self.setWindowTitle("1688 产品信息实时抓取工具 v3.0")
         self.setWindowTitle("1688 产品信息实时抓取工具 v3.0")
         self.setGeometry(100, 100, 1100, 750)
         self.setGeometry(100, 100, 1100, 750)
+
+        # 设置窗口图标
+        icon_path = get_resource_path("app.ico")
+        if os.path.exists(icon_path):
+            self.setWindowIcon(QIcon(icon_path))
+
         central_widget = QWidget()
         central_widget = QWidget()
         self.setCentralWidget(central_widget)
         self.setCentralWidget(central_widget)
         main_layout = QHBoxLayout(central_widget)
         main_layout = QHBoxLayout(central_widget)
@@ -116,12 +132,37 @@ class MainWindow(QMainWindow):
 
 
         action_layout = QHBoxLayout()
         action_layout = QHBoxLayout()
         self.category_display = QLabel("请选择二级类目")
         self.category_display = QLabel("请选择二级类目")
+        
+        # 抓取数量配置
+        count_layout = QHBoxLayout()
+        self.count_spin = QSpinBox()
+        self.count_spin.setRange(1, 10000)
+        self.count_spin.setValue(200)
+        self.count_spin.setFixedWidth(80)
+        count_layout.addWidget(QLabel("抓取数量:"))
+        count_layout.addWidget(self.count_spin)
+        
         self.search_btn = QPushButton("开始抓取")
         self.search_btn = QPushButton("开始抓取")
+        self.search_btn.setEnabled(False) # 初始置灰,直到选择类目和路径
         self.search_btn.clicked.connect(self.start_scraping)
         self.search_btn.clicked.connect(self.start_scraping)
         self.search_btn.setMinimumHeight(50)
         self.search_btn.setMinimumHeight(50)
-        self.search_btn.setStyleSheet("QPushButton { background-color: #0078d4; color: white; font-weight: bold; font-size: 16px; }")
+        self.search_btn.setStyleSheet("""
+            QPushButton { 
+                background-color: #0078d4; 
+                color: white; 
+                font-weight: bold; 
+                font-size: 16px; 
+                border-radius: 4px;
+            }
+            QPushButton:disabled { 
+                background-color: #cccccc; 
+                color: #888888; 
+            }
+        """)
+        
         action_layout.addWidget(QLabel("<font color='red'>*</font>检索类目:"))
         action_layout.addWidget(QLabel("<font color='red'>*</font>检索类目:"))
         action_layout.addWidget(self.category_display, 1)
         action_layout.addWidget(self.category_display, 1)
+        action_layout.addLayout(count_layout)
         action_layout.addWidget(self.search_btn)
         action_layout.addWidget(self.search_btn)
         right_layout.addLayout(action_layout)
         right_layout.addLayout(action_layout)
 
 
@@ -178,6 +219,7 @@ class MainWindow(QMainWindow):
             if self.output_base_path:
             if self.output_base_path:
                 full_p = os.path.normpath(os.path.join(self.output_base_path, "选品", self.selected_category_1, f"{self.selected_category_2}.xlsx"))
                 full_p = os.path.normpath(os.path.join(self.output_base_path, "选品", self.selected_category_1, f"{self.selected_category_2}.xlsx"))
                 self.path_display.setText(full_p)
                 self.path_display.setText(full_p)
+                self.search_btn.setEnabled(True) # 仅在路径和类目都选好时启用按钮
 
 
     def select_output_path(self):
     def select_output_path(self):
         p = QFileDialog.getExistingDirectory(self, "选择保存根目录")
         p = QFileDialog.getExistingDirectory(self, "选择保存根目录")
@@ -192,18 +234,17 @@ class MainWindow(QMainWindow):
         file_path = os.path.normpath(os.path.join(target_dir, f"{self.selected_category_2}.xlsx"))
         file_path = os.path.normpath(os.path.join(target_dir, f"{self.selected_category_2}.xlsx"))
         self.current_output_file = file_path # 记录当前文件用于最后打开
         self.current_output_file = file_path # 记录当前文件用于最后打开
         
         
-        # 启动抓取前清理旧文件
-        if os.path.exists(file_path):
-            try: os.remove(file_path)
-            except: pass
+        # 启动抓取前不再删除旧文件,实现断点续爬功能
 
 
         self.search_btn.setEnabled(False)
         self.search_btn.setEnabled(False)
+        self.count_spin.setEnabled(False) # 任务开始后也禁用数量输入
         self.status_label.setText("处理中……")
         self.status_label.setText("处理中……")
         self.log_output.clear()
         self.log_output.clear()
         self.pbar.setValue(0)
         self.pbar.setValue(0)
         headless = not self.show_browser_cb.isChecked()
         headless = not self.show_browser_cb.isChecked()
+        total_count = self.count_spin.value()
         
         
-        self.thread = ScraperThread(self.selected_category_2, file_path, headless)
+        self.thread = ScraperThread(self.selected_category_2, file_path, total_count, headless)
         self.thread.log.connect(self.log_output.append)
         self.thread.log.connect(self.log_output.append)
         self.thread.progress.connect(self.pbar.setValue)
         self.thread.progress.connect(self.pbar.setValue)
         self.thread.finished.connect(self.on_finished)
         self.thread.finished.connect(self.on_finished)
@@ -211,6 +252,7 @@ class MainWindow(QMainWindow):
 
 
     def on_finished(self, err, scraper, duration):
     def on_finished(self, err, scraper, duration):
         self.search_btn.setEnabled(True)
         self.search_btn.setEnabled(True)
+        self.count_spin.setEnabled(True) # 任务结束后恢复数量输入
         if scraper: self.active_scraper = scraper
         if scraper: self.active_scraper = scraper
         
         
         if not err:
         if not err:

+ 25 - 16
src/scraper.py

@@ -111,7 +111,7 @@ class Scraper1688:
         return True
         return True
 
 
     # def search_products_yield(self, keyword, total_count=200):
     # def search_products_yield(self, keyword, total_count=200):
-    def search_products_yield(self, keyword, total_count=20):
+    def search_products_yield(self, keyword, total_count=200, existing_links=None):
         gbk_keyword = urllib.parse.quote(keyword, encoding='gbk')
         gbk_keyword = urllib.parse.quote(keyword, encoding='gbk')
         base_url = f"https://s.1688.com/selloffer/offer_search.htm?keywords={gbk_keyword}&n=y&netType=1%2C11%2C16"
         base_url = f"https://s.1688.com/selloffer/offer_search.htm?keywords={gbk_keyword}&n=y&netType=1%2C11%2C16"
         
         
@@ -119,12 +119,15 @@ class Scraper1688:
         self.driver.get("https://www.1688.com")
         self.driver.get("https://www.1688.com")
         self.check_for_captcha()
         self.check_for_captcha()
 
 
-        all_links = set()
+        all_links = existing_links if existing_links is not None else set()
         page = 1
         page = 1
         consecutive_empty_pages = 0
         consecutive_empty_pages = 0
         
         
-        while len(all_links) < total_count and consecutive_empty_pages < 3:
-            print(f"[*] 正在抓取列表页: 第 {page} 页...")
+        # 记录初始抓取的链接数,用于计算进度
+        initial_count = len(all_links)
+        
+        while len(all_links) < total_count + initial_count and consecutive_empty_pages < 3:
+            print(f"[*] 正在搜索列表页: 第 {page} 页...")
             target_url = f"{base_url}&beginPage={page}&page={page}"
             target_url = f"{base_url}&beginPage={page}&page={page}"
             self.driver.get(target_url)
             self.driver.get(target_url)
             
             
@@ -168,7 +171,7 @@ class Scraper1688:
                     # 详情页抓取后的随机等待
                     # 详情页抓取后的随机等待
                     time.sleep(random.uniform(2, 4))
                     time.sleep(random.uniform(2, 4))
                     
                     
-                    if len(all_links) >= total_count:
+                    if len(all_links) >= total_count + initial_count:
                         break
                         break
             
             
             # 每页结束,将不足 10 条的余数 yield 出去
             # 每页结束,将不足 10 条的余数 yield 出去
@@ -177,8 +180,8 @@ class Scraper1688:
                 page_batch = []
                 page_batch = []
 
 
             page += 1
             page += 1
-            if len(all_links) < total_count:
-                print(f"[*] 累计已处理: {len(all_links)} 条,准备翻下一页...")
+            if len(all_links) < total_count + initial_count:
+                print(f"[*] 累计已处理新链接: {len(all_links) - initial_count} 条,准备翻下一页...")
                 time.sleep(3)
                 time.sleep(3)
 
 
         return list(all_links)
         return list(all_links)
@@ -263,7 +266,7 @@ class Scraper1688:
                            or safe_text(By.CSS_SELECTOR, "div.company-name"),
                            or safe_text(By.CSS_SELECTOR, "div.company-name"),
             }
             }
 
 
-            # --- 核心逻辑:拆分颜色分类 ---
+            # --- 核心逻辑:拆分规格/颜色分类 ---
             sku_props = []
             sku_props = []
             try:
             try:
                 # 尝试多种路径获取 SKU 属性
                 # 尝试多种路径获取 SKU 属性
@@ -272,21 +275,27 @@ class Scraper1688:
                             model.get("sku", {}).get("skuProps", [])
                             model.get("sku", {}).get("skuProps", [])
             except: pass
             except: pass
 
 
-            # 寻找“颜色分类”或类似的属性
-            color_prop = next((p for p in sku_props if p.get("prop") in ["颜色", "颜色分类", "花色"]), None)
+            # 智能寻找主维度:
+            # 1. 优先找包含“颜色”、“分类”、“款式”、“花色”的维度
+            # 2. 如果没有,则取第一个 SKU 维度(例如“净含量”、“规格”等)
+            main_prop = None
+            if sku_props:
+                main_prop = next((p for p in sku_props if any(k in p.get("prop", "") for k in ["颜色", "分类", "款式", "花色"])), None)
+                if not main_prop:
+                    main_prop = sku_props[0]
             
             
-            if color_prop and color_prop.get("value"):
+            if main_prop and main_prop.get("value"):
                 variant_results = []
                 variant_results = []
-                for val in color_prop["value"]:
+                for val in main_prop["value"]:
                     # 只有当该分类确实有名字时才记录
                     # 只有当该分类确实有名字时才记录
-                    c_name = val.get("name")
-                    if c_name:
+                    variant_name = val.get("name")
+                    if variant_name:
                         row = base_data.copy()
                         row = base_data.copy()
-                        row["color"] = c_name
+                        row["color"] = variant_name
                         variant_results.append(row)
                         variant_results.append(row)
                 return variant_results
                 return variant_results
             else:
             else:
-                # 兜底:如果没有 SKU 拆分,则尝试获取单属性颜色
+                # 兜底:如果没有发现规格选择区,则获取单属性颜色
                 base_data["color"] = get_attr("颜色") or get_attr("颜色分类") or ""
                 base_data["color"] = get_attr("颜色") or get_attr("颜色分类") or ""
                 return [base_data]
                 return [base_data]