import sys
import os
import time
import traceback
import pandas as pd
from PyQt6.QtWidgets import (QApplication, QMainWindow, QWidget, QVBoxLayout,
QHBoxLayout, QLineEdit, QPushButton, QTextEdit,
QLabel, QFileDialog, QProgressBar, QTreeView, QSplitter, QCheckBox, QSpinBox)
from PyQt6.QtCore import QThread, pyqtSignal, Qt
from PyQt6.QtGui import QStandardItemModel, QStandardItem, QIcon
from src.scraper import Scraper1688
from src.excel_handler import append_to_template, get_existing_info
def get_resource_path(relative_path):
    """Return the absolute path of a resource file.

    Works both in the development environment and inside a PyInstaller
    bundle.
    """
    # A PyInstaller bundle exposes its base directory as sys._MEIPASS;
    # otherwise resources are resolved against the current working directory.
    base_dir = sys._MEIPASS if hasattr(sys, '_MEIPASS') else os.getcwd()
    return os.path.join(base_dir, relative_path)
class ScraperThread(QThread):
    """Worker thread that runs one scraping task and streams results to Excel.

    Each batch yielded by the scraper is appended to ``output_path`` as soon
    as it arrives, and progress/log updates are reported through Qt signals.
    """

    # Percentage (0-100) of the requested item count collected so far.
    progress = pyqtSignal(int)
    # One human-readable log line.
    log = pyqtSignal(str)
    # (error_traceback, scraper, elapsed_seconds); error_traceback is "" on
    # success and scraper is None if it could not be created.
    # NOTE(review): this attribute hides QThread's built-in no-argument
    # ``finished`` signal on this class; the name is kept because callers
    # connect to it.
    finished = pyqtSignal(str, object, float)

    def __init__(self, keyword, output_path, total_count, headless=True):
        """Store the task parameters; no work is done until the thread runs.

        Args:
            keyword: Search keyword passed to the scraper.
            output_path: Excel file that receives the results.
            total_count: Number of items requested; also the progress
                denominator.
            headless: Forwarded to ``Scraper1688(headless=...)``.
        """
        super().__init__()
        self.keyword = keyword
        self.output_path = output_path
        self.total_count = total_count
        self.headless = headless

    def _report_status(self, is_waiting, msg):
        """Forward a status callback message to the ``log`` signal."""
        if is_waiting:
            self.log.emit(f"!!! {msg} !!!")
        else:
            self.log.emit(f"[√] {msg}")

    def run(self):
        """Scrape in batches, persist each batch, then emit ``finished``.

        Any exception is caught, logged, and reported through ``finished``
        as a formatted traceback instead of propagating out of the thread.
        """
        scraper = None
        start_time = time.time()
        try:
            # Links already recorded in the output file are handed to the
            # scraper so an interrupted task can be resumed.
            existing_links, _ = get_existing_info(self.output_path)
            if existing_links:
                self.log.emit(f"[*] 发现已有记录: {len(existing_links)} 条,将从新记录开始搜索...")
            self.log.emit(f"[*] 任务启动: {self.keyword}")

            scraper = Scraper1688(headless=self.headless, status_callback=self._report_status)

            # Consume the streaming generator batch by batch.
            collected_count = 0
            batches = scraper.search_products_yield(
                self.keyword,
                total_count=self.total_count,
                existing_links=existing_links,
            )
            for batch_results in batches:
                # Write to Excel immediately (per the original author's note
                # a batch is 10 items, or the remainder at the end of a page).
                append_to_template(batch_results, self.output_path, status_callback=self._report_status)
                collected_count += len(batch_results)
                self.log.emit(f"[+] 新增数据已持久化: {len(batch_results)} 条,本次共计: {collected_count}")
                # Guard the division: a non-positive target must not raise
                # ZeroDivisionError after data has already been written.
                if self.total_count > 0:
                    percent = int((collected_count / self.total_count) * 100)
                else:
                    percent = 100
                self.progress.emit(min(percent, 100))

            duration = time.time() - start_time
            self.log.emit(f"[完成] 任务结束,本次新增抓取 {collected_count} 条数据。")
            self.log.emit(f"[耗时] 处理总时间: {duration:.2f} 秒")
            self.finished.emit("", scraper, duration)
        except Exception as e:
            duration = time.time() - start_time
            err = traceback.format_exc()
            self.log.emit(f"[错误] {str(e)}")
            self.finished.emit(err, scraper, duration)
class MainWindow(QMainWindow):
    """Main window of the 1688 product scraping tool.

    The left pane shows a two-level category tree loaded from an Excel file.
    The right pane holds the output-path picker, the scrape controls, the
    task log and a progress bar. Scraping itself runs in a ``ScraperThread``.
    """

    def __init__(self):
        """Initialise selection state, build the UI and load default categories."""
        super().__init__()
        self.selected_category_1 = ""  # first-level category of the selection
        self.selected_category_2 = ""  # second-level category; used as the search keyword
        self.output_base_path = ""  # root directory chosen by the user
        # Scraper handed back by the last finished task (set in on_finished).
        self.active_scraper = None
        # Output file of the current/last task; opened when the task succeeds.
        self.current_output_file = ""
        # True from start_scraping() until on_finished(); prevents a second
        # task from being started while one is still running.
        self.is_scraping = False
        self.initUI()
        self.load_default_categories()

    def initUI(self):
        """Create and lay out all widgets and connect their signals."""
        self.setWindowTitle("1688 产品信息实时抓取工具 v3.0")
        self.setGeometry(100, 100, 1100, 750)

        # Window icon (only if the icon file is present).
        icon_path = get_resource_path("app.ico")
        if os.path.exists(icon_path):
            self.setWindowIcon(QIcon(icon_path))

        central_widget = QWidget()
        self.setCentralWidget(central_widget)
        main_layout = QHBoxLayout(central_widget)

        # Left side: category tree.
        left_widget = QWidget()
        left_layout = QVBoxLayout(left_widget)
        self.load_category_btn = QPushButton("选择类目文件")
        self.load_category_btn.clicked.connect(self.select_category_file)
        self.category_tree = QTreeView()
        self.category_tree.setHeaderHidden(True)
        self.category_model = QStandardItemModel()
        self.category_tree.setModel(self.category_model)
        self.category_tree.clicked.connect(self.on_category_clicked)
        left_layout.addWidget(QLabel("商品类目树"))
        left_layout.addWidget(self.load_category_btn)
        left_layout.addWidget(self.category_tree)

        # Right side: controls.
        right_widget = QWidget()
        right_layout = QVBoxLayout(right_widget)

        opt_layout = QHBoxLayout()
        self.show_browser_cb = QCheckBox("显示浏览器界面 (手动过验证时勾选)")
        self.show_browser_cb.setChecked(True)
        opt_layout.addWidget(self.show_browser_cb)
        right_layout.addLayout(opt_layout)

        path_layout = QHBoxLayout()
        self.path_display = QLabel("未选择输出路径")
        self.path_display.setStyleSheet("color: gray; border: 1px solid #ccc; padding: 5px;")
        self.select_path_btn = QPushButton("选择输出目录")
        self.select_path_btn.clicked.connect(self.select_output_path)
        path_layout.addWidget(QLabel("*输出路径:"))
        path_layout.addWidget(self.path_display, 1)
        path_layout.addWidget(self.select_path_btn)
        right_layout.addLayout(path_layout)

        action_layout = QHBoxLayout()
        self.category_display = QLabel("请选择二级类目")

        # Number of items to scrape.
        count_layout = QHBoxLayout()
        self.count_spin = QSpinBox()
        self.count_spin.setRange(1, 10000)
        self.count_spin.setValue(200)
        self.count_spin.setFixedWidth(80)
        count_layout.addWidget(QLabel("抓取数量:"))
        count_layout.addWidget(self.count_spin)

        self.search_btn = QPushButton("开始抓取")
        # Disabled until both a category and an output path are chosen.
        self.search_btn.setEnabled(False)
        self.search_btn.clicked.connect(self.start_scraping)
        self.search_btn.setMinimumHeight(50)
        self.search_btn.setStyleSheet("""
            QPushButton {
                background-color: #0078d4;
                color: white;
                font-weight: bold;
                font-size: 16px;
                border-radius: 4px;
            }
            QPushButton:disabled {
                background-color: #cccccc;
                color: #888888;
            }
        """)
        action_layout.addWidget(QLabel("*检索类目:"))
        action_layout.addWidget(self.category_display, 1)
        action_layout.addLayout(count_layout)
        action_layout.addWidget(self.search_btn)
        right_layout.addLayout(action_layout)

        self.pbar = QProgressBar()
        self.log_output = QTextEdit()
        self.log_output.setReadOnly(True)
        right_layout.addWidget(QLabel("任务日志:"))
        right_layout.addWidget(self.log_output)
        right_layout.addWidget(self.pbar)
        self.status_label = QLabel("就绪")
        right_layout.addWidget(self.status_label)

        splitter = QSplitter(Qt.Orientation.Horizontal)
        splitter.addWidget(left_widget)
        splitter.addWidget(right_widget)
        splitter.setStretchFactor(1, 3)
        main_layout.addWidget(splitter)

    def load_default_categories(self):
        """Load ``templates/商品类目.xlsx`` (relative to the cwd) if it exists."""
        default_file = os.path.join('templates', '商品类目.xlsx')
        if os.path.exists(default_file):
            self.load_categories(default_file)

    def select_category_file(self):
        """Let the user pick a category workbook and load it."""
        chosen, _ = QFileDialog.getOpenFileName(self, "选择类目文件", "templates", "Excel (*.xlsx)")
        if chosen:
            self.load_categories(chosen)

    def load_categories(self, f_path):
        """Rebuild the category tree from the first two columns of a workbook.

        Column 0 holds the first-level category (blank cells inherit the
        value above via forward fill) and column 1 the second-level category.
        Failures are reported in the task log instead of being silently
        swallowed; they never propagate to the caller.
        """
        try:
            df = pd.read_excel(f_path)
            if df.shape[1] < 2:
                raise ValueError("类目文件至少需要两列")
            c1, c2 = df.columns[0], df.columns[1]
            df[c1] = df[c1].ffill()
            self.category_model.clear()
            parents = {}
            for _, row in df.iterrows():
                # Skip rows with a missing cell; str(NaN) would otherwise
                # create a bogus "nan" node in the tree.
                if pd.isna(row[c1]) or pd.isna(row[c2]):
                    continue
                v1, v2 = str(row[c1]), str(row[c2])
                if v1 not in parents:
                    parent = QStandardItem(v1)
                    # Only second-level items may be selected.
                    parent.setSelectable(False)
                    self.category_model.appendRow(parent)
                    parents[v1] = parent
                child = QStandardItem(v2)
                # Remember the parent category name on the child item.
                child.setData(v1, Qt.ItemDataRole.UserRole)
                parents[v1].appendRow(child)
            self.category_tree.expandAll()
        except Exception as e:
            self.log_output.append(f"[错误] 类目文件加载失败: {e}")

    def on_category_clicked(self, index):
        """Record the clicked second-level category and refresh the labels."""
        item = self.category_model.itemFromIndex(index)
        # itemFromIndex returns None for an invalid index; first-level items
        # are not selectable and are ignored.
        if item is None or not item.isSelectable():
            return
        self.selected_category_2 = item.text()
        self.selected_category_1 = item.data(Qt.ItemDataRole.UserRole)
        self.update_displays()

    def _output_file_path(self):
        """Return <base>/选品/<category 1>/<category 2>.xlsx, normalised."""
        return os.path.normpath(os.path.join(
            self.output_base_path,
            "选品",
            self.selected_category_1,
            f"{self.selected_category_2}.xlsx",
        ))

    def update_displays(self):
        """Refresh the category/path labels and the start button state."""
        if self.selected_category_1 and self.selected_category_2:
            self.category_display.setText(f"{self.selected_category_1} / {self.selected_category_2}")
            if self.output_base_path:
                self.path_display.setText(self._output_file_path())
                # Enable only when path and category are both chosen, and
                # never while a task is running: otherwise a second thread
                # could be started and replace the running one.
                self.search_btn.setEnabled(not self.is_scraping)

    def select_output_path(self):
        """Let the user choose the root output directory."""
        chosen = QFileDialog.getExistingDirectory(self, "选择保存根目录")
        if chosen:
            self.output_base_path = chosen
            self.update_displays()

    def start_scraping(self):
        """Validate the selection and start a ``ScraperThread`` for it."""
        if self.is_scraping:
            return
        if not self.selected_category_2 or not self.output_base_path:
            self.log_output.append("[错误] 请选择类目和输出路径")
            return

        file_path = self._output_file_path()
        # Remembered so the file can be opened when the task finishes.
        self.current_output_file = file_path

        # An existing output file is deliberately kept so the task can resume.
        self.is_scraping = True
        self.search_btn.setEnabled(False)
        self.count_spin.setEnabled(False)  # locked while the task runs
        self.status_label.setText("处理中……")
        self.log_output.clear()
        self.pbar.setValue(0)

        headless = not self.show_browser_cb.isChecked()
        total_count = self.count_spin.value()

        # The previous worker emits its result from inside run(), so it may
        # still be winding down; wait for it before dropping the reference.
        # Until the first run ``self.thread`` is the inherited QObject.thread
        # method, hence the isinstance check.
        previous = getattr(self, "thread", None)
        if isinstance(previous, QThread) and previous.isRunning():
            previous.wait()

        self.thread = ScraperThread(self.selected_category_2, file_path, total_count, headless)
        self.thread.log.connect(self.log_output.append)
        self.thread.progress.connect(self.pbar.setValue)
        self.thread.finished.connect(self.on_finished)
        self.thread.start()

    def on_finished(self, err, scraper, duration):
        """Restore the controls and report the outcome of a finished task.

        Args:
            err: Empty string on success, otherwise a formatted traceback.
            scraper: Scraper instance used by the task, or None.
            duration: Elapsed seconds (already logged by the worker).
        """
        self.is_scraping = False
        self.search_btn.setEnabled(True)
        self.count_spin.setEnabled(True)  # unlocked once the task ends
        if scraper:
            self.active_scraper = scraper

        if err:
            self.status_label.setText("异常终止")
            # Show the full traceback; the worker only logs the message.
            self.log_output.append(err)
            return

        self.status_label.setText("任务完成")
        # Open the result file automatically.
        if self.current_output_file and os.path.exists(self.current_output_file):
            try:
                # os.startfile exists only on Windows; elsewhere the
                # AttributeError is reported as a warning below.
                os.startfile(self.current_output_file)
                self.log_output.append("[系统] 已自动打开结果文件")
            except Exception as e:
                self.log_output.append(f"[警告] 无法自动打开文件: {e}")
if __name__ == "__main__":
    # Create the Qt application, show the main window, then run the event
    # loop and exit the process with its return code.
    qt_app = QApplication(sys.argv)
    main_window = MainWindow()
    main_window.show()
    exit_code = qt_app.exec()
    sys.exit(exit_code)