| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279 |
- import sys
- import os
- import time
- import traceback
- import pandas as pd
- from PyQt6.QtWidgets import (QApplication, QMainWindow, QWidget, QVBoxLayout,
- QHBoxLayout, QLineEdit, QPushButton, QTextEdit,
- QLabel, QFileDialog, QProgressBar, QTreeView, QSplitter, QCheckBox, QSpinBox)
- from PyQt6.QtCore import QThread, pyqtSignal, Qt
- from PyQt6.QtGui import QStandardItemModel, QStandardItem, QIcon
- from src.scraper import Scraper1688
- from src.excel_handler import append_to_template, get_existing_info
def get_resource_path(relative_path):
    """Return the absolute path of a resource file.

    Works both in a development checkout and inside a PyInstaller bundle:
    when ``sys._MEIPASS`` exists it is used as the base directory, otherwise
    the current working directory is used.
    """
    try:
        base_dir = sys._MEIPASS
    except AttributeError:
        # Attribute is absent, so fall back to the working directory.
        base_dir = os.getcwd()
    return os.path.join(base_dir, relative_path)
class ScraperThread(QThread):
    """Background worker that scrapes products and streams them to Excel.

    Signals:
        progress(int): completion percentage, capped at 100.
        log(str): one rich-text (HTML) line for the task log.
        finished(str, object, float): traceback text ("" on success), the
            scraper instance (``None`` if it was never created) and the
            elapsed time in seconds.
    """

    progress = pyqtSignal(int)
    log = pyqtSignal(str)
    # NOTE(review): this redefines QThread's built-in ``finished`` signal with
    # a payload (error text, scraper, elapsed seconds). Callers in this file
    # connect to this three-argument version, so the name is kept.
    finished = pyqtSignal(str, object, float)

    def __init__(self, keyword, output_path, total_count, headless=True):
        """Store the task parameters; the work itself happens in ``run``.

        Args:
            keyword: search keyword handed to the scraper.
            output_path: Excel file that results are appended to.
            total_count: requested number of products; used as the
                denominator of the progress percentage.
            headless: forwarded to ``Scraper1688(headless=...)``.
        """
        super().__init__()
        self.keyword = keyword
        self.output_path = output_path
        self.total_count = total_count
        self.headless = headless

    def run(self):
        """Scrape in batches, persist each batch, and report progress.

        Any exception is caught, logged, and reported through ``finished``
        together with its traceback; it is never propagated.
        """
        # Function-scope import keeps this block self-contained.
        from html import escape

        scraper = None
        start_time = time.time()
        try:
            # Read the links already stored in the output file so the run
            # resumes instead of re-fetching them.
            existing_links, _ = get_existing_info(self.output_path)
            if existing_links:
                self.log.emit(f"[*] 发现已有记录: {len(existing_links)} 条,将从新记录开始搜索...")
            self.log.emit(f"<b>[*] 任务启动: {self.keyword}</b>")

            def status_cb(is_waiting, msg):
                # Waiting states are shown large and red, the rest in green.
                if is_waiting:
                    self.log.emit(f"<font color='red' size='5'><b>!!! {msg} !!!</b></font>")
                else:
                    self.log.emit(f"<font color='green'><b>[√] {msg}</b></font>")

            scraper = Scraper1688(headless=self.headless, status_callback=status_cb)

            # Consume the streaming generator batch by batch.
            collected_count = 0
            product_index = 0
            # Clamp the denominator so a zero/negative target cannot raise
            # ZeroDivisionError in the progress computation.
            progress_target = max(self.total_count, 1)

            for batch_results in scraper.search_products_yield(
                self.keyword,
                total_count=self.total_count,
                existing_links=existing_links,
            ):
                # Persist every batch immediately.
                append_to_template(batch_results, self.output_path, status_callback=status_cb)

                # Several rows may share one link; count distinct products.
                unique_links_in_batch = len(
                    {item.get('link') for item in batch_results if item.get('link')}
                )
                product_index += unique_links_in_batch
                collected_count += len(batch_results)

                self.log.emit(f"[+] 解析到第 {product_index} 个商品,新增数据已持久化: {len(batch_results)} 条,本次共计: {collected_count}")

                prog = int((product_index / progress_target) * 100)
                self.progress.emit(min(prog, 100))

            duration = time.time() - start_time
            self.log.emit(f"<b>[完成] 任务结束,本次新增抓取 {collected_count} 条数据。</b>")
            self.log.emit(f"<b>[耗时] 处理总时间: {duration:.2f} 秒</b>")
            self.finished.emit("", scraper, duration)
        except Exception as e:
            duration = time.time() - start_time
            err = traceback.format_exc()
            # The log is rendered as rich text, so escape the message:
            # otherwise anything that looks like a tag would be swallowed.
            self.log.emit(f"<font color='red'>[错误] {escape(str(e))}</font>")
            self.finished.emit(err, scraper, duration)
class MainWindow(QMainWindow):
    """Main window: category tree on the left, scrape controls and log on the right."""

    def __init__(self):
        """Initialise selection state, build the UI and load default categories."""
        super().__init__()
        self.selected_category_1 = ""
        self.selected_category_2 = ""
        self.output_base_path = ""
        self.active_scraper = None
        # File written by the most recent task; opened when the task succeeds.
        self.current_output_file = ""
        # Worker of the current/most recent task. Deliberately NOT named
        # ``thread``: that would shadow the inherited QObject.thread() method.
        self.scraper_thread = None
        # True between start_scraping() and on_finished(); keeps the start
        # button disabled while a worker is alive.
        self._task_running = False
        self.initUI()
        self.load_default_categories()

    def initUI(self):
        """Create all widgets and lay them out in a horizontal splitter."""
        self.setWindowTitle("1688 产品信息实时抓取工具 v3.0")
        self.setGeometry(100, 100, 1100, 750)
        # Window icon (only applied when the file exists).
        icon_path = get_resource_path("app.ico")
        if os.path.exists(icon_path):
            self.setWindowIcon(QIcon(icon_path))
        central_widget = QWidget()
        self.setCentralWidget(central_widget)
        main_layout = QHBoxLayout(central_widget)

        # Left side: category tree.
        left_widget = QWidget()
        left_layout = QVBoxLayout(left_widget)
        self.load_category_btn = QPushButton("选择类目文件")
        self.load_category_btn.clicked.connect(self.select_category_file)
        self.category_tree = QTreeView()
        self.category_tree.setHeaderHidden(True)
        self.category_model = QStandardItemModel()
        self.category_tree.setModel(self.category_model)
        self.category_tree.clicked.connect(self.on_category_clicked)
        left_layout.addWidget(QLabel("<b>商品类目树</b>"))
        left_layout.addWidget(self.load_category_btn)
        left_layout.addWidget(self.category_tree)

        # Right side: options, output path, action row, log and progress.
        right_widget = QWidget()
        right_layout = QVBoxLayout(right_widget)
        opt_layout = QHBoxLayout()
        self.show_browser_cb = QCheckBox("显示浏览器界面 (手动过验证时勾选)")
        self.show_browser_cb.setChecked(True)
        opt_layout.addWidget(self.show_browser_cb)
        right_layout.addLayout(opt_layout)

        path_layout = QHBoxLayout()
        self.path_display = QLabel("未选择输出路径")
        self.path_display.setStyleSheet("color: gray; border: 1px solid #ccc; padding: 5px;")
        self.select_path_btn = QPushButton("选择输出目录")
        self.select_path_btn.clicked.connect(self.select_output_path)
        path_layout.addWidget(QLabel("<font color='red'>*</font>输出路径:"))
        path_layout.addWidget(self.path_display, 1)
        path_layout.addWidget(self.select_path_btn)
        right_layout.addLayout(path_layout)

        action_layout = QHBoxLayout()
        self.category_display = QLabel("请选择二级类目")

        # Number of products to fetch.
        count_layout = QHBoxLayout()
        self.count_spin = QSpinBox()
        self.count_spin.setRange(1, 10000)
        self.count_spin.setValue(200)
        self.count_spin.setFixedWidth(80)
        count_layout.addWidget(QLabel("抓取数量:"))
        count_layout.addWidget(self.count_spin)

        self.search_btn = QPushButton("开始抓取")
        # Disabled until both a category and an output path are chosen.
        self.search_btn.setEnabled(False)
        self.search_btn.clicked.connect(self.start_scraping)
        self.search_btn.setMinimumHeight(50)
        self.search_btn.setStyleSheet("""
            QPushButton {
                background-color: #0078d4;
                color: white;
                font-weight: bold;
                font-size: 16px;
                border-radius: 4px;
            }
            QPushButton:disabled {
                background-color: #cccccc;
                color: #888888;
            }
        """)

        action_layout.addWidget(QLabel("<font color='red'>*</font>检索类目:"))
        action_layout.addWidget(self.category_display, 1)
        action_layout.addLayout(count_layout)
        action_layout.addWidget(self.search_btn)
        right_layout.addLayout(action_layout)

        self.pbar = QProgressBar()
        self.log_output = QTextEdit()
        self.log_output.setReadOnly(True)
        right_layout.addWidget(QLabel("<b>任务日志:</b>"))
        right_layout.addWidget(self.log_output)
        right_layout.addWidget(self.pbar)
        self.status_label = QLabel("就绪")
        right_layout.addWidget(self.status_label)

        splitter = QSplitter(Qt.Orientation.Horizontal)
        splitter.addWidget(left_widget)
        splitter.addWidget(right_widget)
        splitter.setStretchFactor(1, 3)
        main_layout.addWidget(splitter)

    def load_default_categories(self):
        """Load ``templates/商品类目.xlsx`` if it can be found.

        The path relative to the working directory is tried first; the path
        resolved by ``get_resource_path`` is used as a fallback.
        """
        relative = os.path.join('templates', '商品类目.xlsx')
        for candidate in (relative, get_resource_path(relative)):
            if os.path.exists(candidate):
                self.load_categories(candidate)
                return

    def select_category_file(self):
        """Let the user pick a category workbook and load it."""
        f, _ = QFileDialog.getOpenFileName(self, "选择类目文件", "templates", "Excel (*.xlsx)")
        if f:
            self.load_categories(f)

    def load_categories(self, f_path):
        """Populate the category tree from the first two columns of a workbook.

        Column 0 holds the first-level category (blank cells inherit the
        value above via forward fill), column 1 the second-level category.
        Rows missing either value are skipped. Failures are written to the
        task log instead of being silently ignored.
        """
        from html import escape  # function-scope import; the log is rich text

        try:
            df = pd.read_excel(f_path)
            c1, c2 = df.columns[0], df.columns[1]
            df[c1] = df[c1].ffill()
            self.category_model.clear()
            cats = {}
            for _, row in df.iterrows():
                # NaN cells would otherwise become literal "nan" tree nodes.
                if pd.isna(row[c1]) or pd.isna(row[c2]):
                    continue
                v1, v2 = str(row[c1]), str(row[c2])
                if v1 not in cats:
                    parent = QStandardItem(v1)
                    # First-level nodes are headers only and cannot be picked.
                    parent.setSelectable(False)
                    self.category_model.appendRow(parent)
                    cats[v1] = parent
                child = QStandardItem(v2)
                # Remember the parent category name on the child item.
                child.setData(v1, Qt.ItemDataRole.UserRole)
                cats[v1].appendRow(child)
            self.category_tree.expandAll()
        except Exception as e:
            self.log_output.append(
                f"<font color='red'>[错误] 类目文件加载失败: {escape(str(e))}</font>"
            )

    def on_category_clicked(self, index):
        """Record the clicked second-level category and refresh the displays."""
        item = self.category_model.itemFromIndex(index)
        if item is None or not item.isSelectable():
            return
        self.selected_category_2 = item.text()
        self.selected_category_1 = item.data(Qt.ItemDataRole.UserRole)
        self.update_displays()

    def update_displays(self):
        """Refresh the category and path labels and the start button state."""
        if not (self.selected_category_1 and self.selected_category_2):
            return
        self.category_display.setText(f"{self.selected_category_1} / <font color='#0078d4'><b>{self.selected_category_2}</b></font>")
        if not self.output_base_path:
            return
        full_p = os.path.normpath(os.path.join(self.output_base_path, "选品", self.selected_category_1, f"{self.selected_category_2}.xlsx"))
        self.path_display.setText(full_p)
        # Enable only when path and category are both set AND no task is
        # running; otherwise a click in the tree during a run would re-enable
        # the button and allow a second worker to be started.
        if not self._task_running:
            self.search_btn.setEnabled(True)

    def select_output_path(self):
        """Let the user pick the root output directory."""
        p = QFileDialog.getExistingDirectory(self, "选择保存根目录")
        if p:
            self.output_base_path = p
            self.update_displays()

    def start_scraping(self):
        """Validate the selection and start a ``ScraperThread`` for it."""
        if self._task_running:
            # A worker is already alive; never replace a running QThread.
            return
        if not self.selected_category_2 or not self.output_base_path:
            self.log_output.append("<font color='red'>[错误] 请选择类目和输出路径</font>")
            return

        target_dir = os.path.join(self.output_base_path, "选品", self.selected_category_1)
        file_path = os.path.normpath(os.path.join(target_dir, f"{self.selected_category_2}.xlsx"))
        try:
            # Make sure the destination folder exists before the worker runs.
            os.makedirs(target_dir, exist_ok=True)
        except OSError as e:
            self.log_output.append(f"<font color='red'>[错误] 无法创建输出目录: {e}</font>")
            return
        # Remembered so the file can be opened when the task finishes.
        self.current_output_file = file_path

        # The existing file is intentionally kept so the run can resume.
        self._task_running = True
        self.search_btn.setEnabled(False)
        self.count_spin.setEnabled(False)  # count is locked during the task
        self.status_label.setText("处理中……")
        self.log_output.clear()
        self.pbar.setValue(0)
        headless = not self.show_browser_cb.isChecked()
        total_count = self.count_spin.value()

        self.scraper_thread = ScraperThread(self.selected_category_2, file_path, total_count, headless)
        self.scraper_thread.log.connect(self.log_output.append)
        self.scraper_thread.progress.connect(self.pbar.setValue)
        self.scraper_thread.finished.connect(self.on_finished)
        self.scraper_thread.start()

    def on_finished(self, err, scraper, duration):
        """Restore the controls and, on success, open the result file.

        Args:
            err: traceback text, empty when the task succeeded.
            scraper: scraper object handed back by the worker (may be None);
                a reference is kept in ``self.active_scraper``.
            duration: elapsed seconds reported by the worker (unused here).
        """
        self._task_running = False
        self.search_btn.setEnabled(True)
        self.count_spin.setEnabled(True)  # count is editable again
        if scraper:
            self.active_scraper = scraper

        if err:
            self.status_label.setText("异常终止")
            # Keep the full traceback available for diagnosis; print() is a
            # no-op when no console stream is attached.
            print(err, file=sys.stderr)
            return

        self.status_label.setText("任务完成")
        # Open the result file automatically.
        if self.current_output_file and os.path.exists(self.current_output_file):
            try:
                # os.startfile exists only on Windows; elsewhere the
                # AttributeError is reported as a warning below.
                os.startfile(self.current_output_file)
                self.log_output.append("<font color='blue'>[系统] 已自动打开结果文件</font>")
            except Exception as e:
                self.log_output.append(f"<font color='orange'>[警告] 无法自动打开文件: {e}</font>")
if __name__ == "__main__":
    # Script entry point: create the Qt application, show the main window
    # and hand the event loop's return code to the interpreter on exit.
    application = QApplication(sys.argv)
    main_window = MainWindow()
    main_window.show()
    exit_code = application.exec()
    sys.exit(exit_code)
|