|
|
@@ -0,0 +1,208 @@
|
|
|
+import sys
|
|
|
+import os
|
|
|
+import time
|
|
|
+import traceback
|
|
|
+import pandas as pd
|
|
|
+from PyQt6.QtWidgets import (QApplication, QMainWindow, QWidget, QVBoxLayout,
|
|
|
+ QHBoxLayout, QLineEdit, QPushButton, QTextEdit,
|
|
|
+ QLabel, QFileDialog, QProgressBar, QTreeView, QSplitter, QCheckBox)
|
|
|
+from PyQt6.QtCore import QThread, pyqtSignal, Qt
|
|
|
+from PyQt6.QtGui import QStandardItemModel, QStandardItem
|
|
|
+
|
|
|
+from src.scraper import Scraper1688
|
|
|
+from src.excel_handler import append_to_template
|
|
|
+
|
|
|
class ScraperThread(QThread):
    """Background worker that scrapes products for a keyword into an Excel file.

    Each batch yielded by the scraper is appended to ``output_path``
    immediately, so the ``finished`` signal carries no product data.

    Signals:
        progress: Integer percentage (0-100) of ``total_count`` collected.
        log: Rich-text (HTML) message intended for the task log widget.
        finished: ``(error_text, scraper)``. ``error_text`` is ``""`` on
            success and the formatted traceback on failure; ``scraper`` is
            the ``Scraper1688`` instance, or ``None`` if it was never created.
    """

    progress = pyqtSignal(int)
    log = pyqtSignal(str)
    # Only the error text and the scraper object are passed; the scraped rows
    # have already been written to disk batch by batch.
    # NOTE(review): this shadows QThread's built-in argument-less `finished`
    # signal. Kept as-is because existing connections rely on this signature.
    finished = pyqtSignal(str, object)

    def __init__(self, keyword, output_path, headless=True, total_count=200):
        """Store the job parameters; no scraping happens until ``start()``.

        Args:
            keyword: Search keyword handed to the scraper.
            output_path: Path of the Excel file that batches are appended to.
            headless: Passed through to ``Scraper1688(headless=...)``.
            total_count: Number of products to request from the scraper and
                the denominator used for the progress percentage.
        """
        super().__init__()
        self.keyword = keyword
        self.output_path = output_path
        self.headless = headless
        self.total_count = total_count

    def _percent(self, collected_count):
        """Return ``collected_count`` as a percentage of the target, clamped to 0-100."""
        if self.total_count <= 0:
            # Avoid ZeroDivisionError for a degenerate target.
            return 0
        percent = int((collected_count / self.total_count) * 100)
        # The last batch may overshoot the target; never report more than 100.
        return max(0, min(100, percent))

    def run(self):
        """Thread body: stream batches from the scraper into the output file.

        Any exception is caught, printed as a traceback, reported through
        ``log`` and delivered to the GUI through ``finished``.
        """
        scraper = None
        try:
            self.log.emit(f"<b>[*] 任务启动: {self.keyword}</b>")
            scraper = Scraper1688(headless=self.headless)

            collected_count = 0
            # The scraper is consumed as a generator, one batch at a time.
            batches = scraper.search_products_yield(
                self.keyword, total_count=self.total_count
            )
            for page_results in batches:
                # Write each batch to Excel as soon as it arrives.
                append_to_template(page_results, self.output_path)

                collected_count += len(page_results)
                self.log.emit(
                    f"[+] 批次写入成功: {len(page_results)} 条,当前总计: {collected_count}"
                )
                self.progress.emit(self._percent(collected_count))

            self.log.emit(f"<b>[完成] 任务结束,共抓取 {collected_count} 条数据。</b>")
            self.finished.emit("", scraper)
        except Exception as e:
            import html

            err = traceback.format_exc()
            print(err)
            # The log widget renders rich text, so escape the message to keep
            # characters such as '<' in the exception text from being parsed
            # as markup.
            self.log.emit(f"<font color='red'>[错误] {html.escape(str(e))}</font>")
            self.finished.emit(err, scraper)
|
|
|
+
|
|
|
class MainWindow(QMainWindow):
    """Main window: category tree on the left, scrape controls and log on the right."""

    def __init__(self):
        """Initialise selection state, build the widgets and load the default categories."""
        super().__init__()
        self.selected_category_1 = ""  # first-level category of the selection
        self.selected_category_2 = ""  # second-level category (used as keyword)
        self.output_base_path = ""  # root directory chosen by the user
        # Scraper handed back by the most recent run (see on_finished).
        # NOTE(review): presumably retained so the scraper is not garbage
        # collected after the thread ends - confirm.
        self.active_scraper = None
        # Reference to the running worker. Deliberately not named `thread`,
        # which would shadow the inherited QObject.thread() method.
        self.scraper_thread = None
        self.initUI()
        self.load_default_categories()

    def initUI(self):
        """Create all widgets and lay them out inside a horizontal splitter."""
        self.setWindowTitle("1688 产品信息实时抓取工具 v3.0")
        self.setGeometry(100, 100, 1100, 750)
        central_widget = QWidget()
        self.setCentralWidget(central_widget)
        main_layout = QHBoxLayout(central_widget)

        # Left side: category tree.
        left_widget = QWidget()
        left_layout = QVBoxLayout(left_widget)
        self.load_category_btn = QPushButton("选择类目文件")
        self.load_category_btn.clicked.connect(self.select_category_file)
        self.category_tree = QTreeView()
        self.category_tree.setHeaderHidden(True)
        self.category_model = QStandardItemModel()
        self.category_tree.setModel(self.category_model)
        self.category_tree.clicked.connect(self.on_category_clicked)
        left_layout.addWidget(QLabel("<b>商品类目树</b>"))
        left_layout.addWidget(self.load_category_btn)
        left_layout.addWidget(self.category_tree)

        # Right side: options, output path, action button, log and progress.
        right_widget = QWidget()
        right_layout = QVBoxLayout(right_widget)

        opt_layout = QHBoxLayout()
        self.show_browser_cb = QCheckBox("显示浏览器界面 (手动过验证时勾选)")
        self.show_browser_cb.setChecked(True)
        opt_layout.addWidget(self.show_browser_cb)
        right_layout.addLayout(opt_layout)

        path_layout = QHBoxLayout()
        self.path_display = QLabel("未选择输出路径")
        self.path_display.setStyleSheet("color: gray; border: 1px solid #ccc; padding: 5px;")
        self.select_path_btn = QPushButton("选择输出目录")
        self.select_path_btn.clicked.connect(self.select_output_path)
        path_layout.addWidget(QLabel("<font color='red'>*</font>输出路径:"))
        path_layout.addWidget(self.path_display, 1)
        path_layout.addWidget(self.select_path_btn)
        right_layout.addLayout(path_layout)

        action_layout = QHBoxLayout()
        self.category_display = QLabel("请选择二级类目")
        self.search_btn = QPushButton("开始抓取")
        self.search_btn.clicked.connect(self.start_scraping)
        self.search_btn.setMinimumHeight(50)
        self.search_btn.setStyleSheet("QPushButton { background-color: #0078d4; color: white; font-weight: bold; font-size: 16px; }")
        action_layout.addWidget(QLabel("<font color='red'>*</font>检索类目:"))
        action_layout.addWidget(self.category_display, 1)
        action_layout.addWidget(self.search_btn)
        right_layout.addLayout(action_layout)

        self.pbar = QProgressBar()
        self.log_output = QTextEdit()
        self.log_output.setReadOnly(True)
        right_layout.addWidget(QLabel("<b>任务日志:</b>"))
        right_layout.addWidget(self.log_output)
        right_layout.addWidget(self.pbar)
        self.status_label = QLabel("就绪")
        right_layout.addWidget(self.status_label)

        splitter = QSplitter(Qt.Orientation.Horizontal)
        splitter.addWidget(left_widget)
        splitter.addWidget(right_widget)
        # Give the right-hand pane (index 1) the larger share of extra space.
        splitter.setStretchFactor(1, 3)
        main_layout.addWidget(splitter)

    def load_default_categories(self):
        """Load ``templates/商品类目.xlsx`` (relative to the CWD) if it exists."""
        default_path = os.path.join('templates', '商品类目.xlsx')
        if os.path.exists(default_path):
            self.load_categories(default_path)

    def select_category_file(self):
        """Let the user pick a category workbook and load it."""
        chosen, _ = QFileDialog.getOpenFileName(self, "选择类目文件", "templates", "Excel (*.xlsx)")
        if chosen:
            self.load_categories(chosen)

    def load_categories(self, f_path):
        """Rebuild the category tree from the first two columns of an Excel file.

        Column 0 holds first-level categories (blank cells inherit the value
        above them via forward fill); column 1 holds second-level categories.
        Rows whose first- or second-level value is missing are skipped.

        Args:
            f_path: Path of the workbook to read.

        On any read/parse failure the error is written to the log widget and
        the existing tree is left untouched.
        """
        try:
            df = pd.read_excel(f_path)
            if df.shape[1] < 2:
                raise ValueError("类目文件至少需要两列(一级类目、二级类目)")
            c1, c2 = df.columns[0], df.columns[1]
            first_level = df[c1].ffill()

            # Group children under their parent, preserving file order
            # (dicts keep insertion order).
            grouped = {}
            for raw1, raw2 in zip(first_level, df[c2]):
                # Missing cells would otherwise show up as the text "nan".
                if pd.isna(raw1) or pd.isna(raw2):
                    continue
                grouped.setdefault(str(raw1), []).append(str(raw2))
        except Exception as exc:
            import html

            # Report the failure instead of silently ignoring it.
            self.log_output.append(
                f"<font color='red'>[错误] 类目文件加载失败: {html.escape(str(exc))}</font>"
            )
            return

        # Only replace the current tree once the file has parsed successfully.
        self.category_model.clear()
        for v1, children in grouped.items():
            parent = QStandardItem(v1)
            # Parents are not selectable; on_category_clicked relies on this
            # to tell first-level rows from second-level rows.
            parent.setSelectable(False)
            for v2 in children:
                child = QStandardItem(v2)
                # Remember the parent name on the child for later lookup.
                child.setData(v1, Qt.ItemDataRole.UserRole)
                parent.appendRow(child)
            self.category_model.appendRow(parent)
        self.category_tree.expandAll()

    def on_category_clicked(self, index):
        """Record the clicked second-level category and refresh the labels.

        Args:
            index: Model index of the clicked tree row.
        """
        item = self.category_model.itemFromIndex(index)
        # itemFromIndex returns None for an invalid index; first-level rows
        # are non-selectable and are ignored.
        if item is None or not item.isSelectable():
            return
        self.selected_category_2 = item.text()
        self.selected_category_1 = item.data(Qt.ItemDataRole.UserRole)
        self.update_displays()

    def _output_file_path(self):
        """Return ``<base>/选品/<category 1>/<category 2>.xlsx`` for the current selection."""
        return os.path.join(
            self.output_base_path,
            "选品",
            self.selected_category_1,
            f"{self.selected_category_2}.xlsx",
        )

    def update_displays(self):
        """Show the selected category and, once a base path is set, the output file path."""
        if not (self.selected_category_1 and self.selected_category_2):
            return
        self.category_display.setText(f"{self.selected_category_1} / <font color='#0078d4'><b>{self.selected_category_2}</b></font>")
        if self.output_base_path:
            self.path_display.setText(os.path.normpath(self._output_file_path()))

    def select_output_path(self):
        """Let the user choose the root output directory."""
        chosen = QFileDialog.getExistingDirectory(self, "选择保存根目录")
        if chosen:
            self.output_base_path = chosen
            self.update_displays()

    def start_scraping(self):
        """Validate the inputs, reset the output file and launch the worker thread."""
        if not self.selected_category_2 or not self.output_base_path:
            self.log_output.append("<font color='red'>[错误] 请选择类目和输出路径</font>")
            return

        import html

        file_path = self._output_file_path()
        target_dir = os.path.dirname(file_path)

        # Clear the log first so warnings emitted below remain visible.
        self.log_output.clear()
        self.pbar.setValue(0)

        # Make sure the destination directory exists before the worker writes.
        try:
            os.makedirs(target_dir, exist_ok=True)
        except OSError as exc:
            self.log_output.append(
                f"<font color='red'>[错误] 无法创建输出目录: {html.escape(str(exc))}</font>"
            )
            return

        # Remove any previous output so the run starts again from the template.
        try:
            os.remove(file_path)
        except FileNotFoundError:
            pass
        except OSError as exc:
            # Best effort: warn (e.g. file open in another program) and continue.
            self.log_output.append(
                f"<font color='orange'>[警告] 无法删除旧文件: {html.escape(str(exc))}</font>"
            )

        self.search_btn.setEnabled(False)
        # The checkbox means "show the browser", i.e. the inverse of headless.
        headless = not self.show_browser_cb.isChecked()

        self.scraper_thread = ScraperThread(self.selected_category_2, file_path, headless)
        self.scraper_thread.log.connect(self.log_output.append)
        self.scraper_thread.progress.connect(self.pbar.setValue)
        self.scraper_thread.finished.connect(self.on_finished)
        self.scraper_thread.start()

    def on_finished(self, err, scraper):
        """Re-enable the start button and show the outcome of the run.

        Args:
            err: Empty string on success, traceback text on failure.
            scraper: Scraper instance used by the run, or None.
        """
        self.search_btn.setEnabled(True)
        if scraper:
            self.active_scraper = scraper
        self.status_label.setText("任务完成" if not err else "异常终止")
|
|
|
+
|
|
|
if __name__ == "__main__":
    # Script entry point: create the Qt application, show the main window and
    # hand the event loop's return code back to the interpreter as exit status.
    application = QApplication(sys.argv)
    main_window = MainWindow()
    main_window.show()
    exit_code = application.exec()
    sys.exit(exit_code)
|