|
|
@@ -5,12 +5,18 @@ import traceback
|
|
|
import pandas as pd
|
|
|
from PyQt6.QtWidgets import (QApplication, QMainWindow, QWidget, QVBoxLayout,
|
|
|
QHBoxLayout, QLineEdit, QPushButton, QTextEdit,
|
|
|
- QLabel, QFileDialog, QProgressBar, QTreeView, QSplitter, QCheckBox)
|
|
|
+ QLabel, QFileDialog, QProgressBar, QTreeView, QSplitter, QCheckBox, QSpinBox)
|
|
|
from PyQt6.QtCore import QThread, pyqtSignal, Qt
|
|
|
-from PyQt6.QtGui import QStandardItemModel, QStandardItem
|
|
|
+from PyQt6.QtGui import QStandardItemModel, QStandardItem, QIcon
|
|
|
|
|
|
from src.scraper import Scraper1688
|
|
|
-from src.excel_handler import append_to_template
|
|
|
+from src.excel_handler import append_to_template, get_existing_info
|
|
|
+
|
|
|
+def get_resource_path(relative_path):
|
|
|
+ """ Resolve a resource path: join relative_path onto sys._MEIPASS when that attribute exists (PyInstaller bundle), otherwise onto the current working directory. """
|
|
|
+ if hasattr(sys, '_MEIPASS'):
|
|
|
+ return os.path.join(sys._MEIPASS, relative_path)
|
|
|
+ return os.path.join(os.getcwd(), relative_path)
|
|
|
|
|
|
class ScraperThread(QThread):
|
|
|
progress = pyqtSignal(int)
|
|
|
@@ -18,16 +24,22 @@ class ScraperThread(QThread):
|
|
|
# finished 信号增加耗时参数 (秒)
|
|
|
finished = pyqtSignal(str, object, float)
|
|
|
|
|
|
- def __init__(self, keyword, output_path, headless=True):
|
|
|
+ def __init__(self, keyword, output_path, total_count, headless=True):
|
|
|
super().__init__()
|
|
|
self.keyword = keyword
|
|
|
self.output_path = output_path
|
|
|
+ self.total_count = total_count  # target item count: run() passes it to search_products_yield and divides by it for the progress percentage
|
|
|
self.headless = headless
|
|
|
|
|
|
def run(self):
|
|
|
scraper = None
|
|
|
start_time = time.time()
|
|
|
try:
|
|
|
+ # 读取已抓取的链接,实现断点续爬
|
|
|
+ existing_links, _ = get_existing_info(self.output_path)
|
|
|
+ if existing_links:
|
|
|
+ self.log.emit(f"[*] 发现已有记录: {len(existing_links)} 条,将从新记录开始搜索...")
|
|
|
+
|
|
|
self.log.emit(f"<b>[*] 任务启动: {self.keyword}</b>")
|
|
|
|
|
|
def status_cb(is_waiting, msg):
|
|
|
@@ -39,22 +51,20 @@ class ScraperThread(QThread):
|
|
|
scraper = Scraper1688(headless=self.headless, status_callback=status_cb)
|
|
|
|
|
|
# 使用流式生成器抓取
|
|
|
- total_target = 20
|
|
|
- # total_target = 200
|
|
|
collected_count = 0
|
|
|
|
|
|
- for batch_results in scraper.search_products_yield(self.keyword, total_count=total_target):
|
|
|
+ for batch_results in scraper.search_products_yield(self.keyword, total_count=self.total_count, existing_links=existing_links):
|
|
|
# 实时写入 Excel (此时 batch_results 为 10 条或页末余数)
|
|
|
- append_to_template(batch_results, self.output_path)
|
|
|
+ append_to_template(batch_results, self.output_path, status_callback=status_cb)
|
|
|
|
|
|
collected_count += len(batch_results)
|
|
|
- self.log.emit(f"[+] 数据已持久化: {len(batch_results)} 条,当前总计: {collected_count}")
|
|
|
+ self.log.emit(f"[+] 新增数据已持久化: {len(batch_results)} 条,本次共计: {collected_count}")
|
|
|
|
|
|
- prog = int((collected_count / total_target) * 100)
|
|
|
+ prog = int((collected_count / self.total_count) * 100)
|
|
|
self.progress.emit(min(prog, 100))
|
|
|
|
|
|
duration = time.time() - start_time
|
|
|
- self.log.emit(f"<b>[完成] 任务结束,共抓取 {collected_count} 条数据。</b>")
|
|
|
+ self.log.emit(f"<b>[完成] 任务结束,本次新增抓取 {collected_count} 条数据。</b>")
|
|
|
self.log.emit(f"<b>[耗时] 处理总时间: {duration:.2f} 秒</b>")
|
|
|
self.finished.emit("", scraper, duration)
|
|
|
except Exception as e:
|
|
|
@@ -76,6 +86,12 @@ class MainWindow(QMainWindow):
|
|
|
def initUI(self):
|
|
|
self.setWindowTitle("1688 产品信息实时抓取工具 v3.0")
|
|
|
self.setGeometry(100, 100, 1100, 750)
|
|
|
+
|
|
|
+ # 设置窗口图标
|
|
|
+ icon_path = get_resource_path("app.ico")
|
|
|
+ if os.path.exists(icon_path):
|
|
|
+ self.setWindowIcon(QIcon(icon_path))
|
|
|
+
|
|
|
central_widget = QWidget()
|
|
|
self.setCentralWidget(central_widget)
|
|
|
main_layout = QHBoxLayout(central_widget)
|
|
|
@@ -116,12 +132,37 @@ class MainWindow(QMainWindow):
|
|
|
|
|
|
action_layout = QHBoxLayout()
|
|
|
self.category_display = QLabel("请选择二级类目")
|
|
|
+
|
|
|
+ # 抓取数量配置
|
|
|
+ count_layout = QHBoxLayout()
|
|
|
+ self.count_spin = QSpinBox()
|
|
|
+ self.count_spin.setRange(1, 10000)
|
|
|
+ self.count_spin.setValue(200)
|
|
|
+ self.count_spin.setFixedWidth(80)
|
|
|
+ count_layout.addWidget(QLabel("抓取数量:"))
|
|
|
+ count_layout.addWidget(self.count_spin)
|
|
|
+
|
|
|
self.search_btn = QPushButton("开始抓取")
|
|
|
+ self.search_btn.setEnabled(False) # 初始置灰,直到选择类目和路径
|
|
|
self.search_btn.clicked.connect(self.start_scraping)
|
|
|
self.search_btn.setMinimumHeight(50)
|
|
|
- self.search_btn.setStyleSheet("QPushButton { background-color: #0078d4; color: white; font-weight: bold; font-size: 16px; }")
|
|
|
+ self.search_btn.setStyleSheet("""
|
|
|
+ QPushButton {
|
|
|
+ background-color: #0078d4;
|
|
|
+ color: white;
|
|
|
+ font-weight: bold;
|
|
|
+ font-size: 16px;
|
|
|
+ border-radius: 4px;
|
|
|
+ }
|
|
|
+ QPushButton:disabled {
|
|
|
+ background-color: #cccccc;
|
|
|
+ color: #888888;
|
|
|
+ }
|
|
|
+ """)
|
|
|
+
|
|
|
action_layout.addWidget(QLabel("<font color='red'>*</font>检索类目:"))
|
|
|
action_layout.addWidget(self.category_display, 1)
|
|
|
+ action_layout.addLayout(count_layout)
|
|
|
action_layout.addWidget(self.search_btn)
|
|
|
right_layout.addLayout(action_layout)
|
|
|
|
|
|
@@ -178,6 +219,7 @@ class MainWindow(QMainWindow):
|
|
|
if self.output_base_path:
|
|
|
full_p = os.path.normpath(os.path.join(self.output_base_path, "选品", self.selected_category_1, f"{self.selected_category_2}.xlsx"))
|
|
|
self.path_display.setText(full_p)
|
|
|
+ self.search_btn.setEnabled(True) # 仅在路径和类目都选好时启用按钮
|
|
|
|
|
|
def select_output_path(self):
|
|
|
p = QFileDialog.getExistingDirectory(self, "选择保存根目录")
|
|
|
@@ -192,18 +234,17 @@ class MainWindow(QMainWindow):
|
|
|
file_path = os.path.normpath(os.path.join(target_dir, f"{self.selected_category_2}.xlsx"))
|
|
|
self.current_output_file = file_path # 记录当前文件用于最后打开
|
|
|
|
|
|
- # 启动抓取前清理旧文件
|
|
|
- if os.path.exists(file_path):
|
|
|
- try: os.remove(file_path)
|
|
|
- except: pass
|
|
|
+ # 启动抓取前不再删除旧文件,实现断点续爬功能
|
|
|
|
|
|
self.search_btn.setEnabled(False)
|
|
|
+ self.count_spin.setEnabled(False) # 任务开始后也禁用数量输入
|
|
|
self.status_label.setText("处理中……")
|
|
|
self.log_output.clear()
|
|
|
self.pbar.setValue(0)
|
|
|
headless = not self.show_browser_cb.isChecked()
|
|
|
+ total_count = self.count_spin.value()
|
|
|
|
|
|
- self.thread = ScraperThread(self.selected_category_2, file_path, headless)
|
|
|
+ self.thread = ScraperThread(self.selected_category_2, file_path, total_count, headless)
|
|
|
self.thread.log.connect(self.log_output.append)
|
|
|
self.thread.progress.connect(self.pbar.setValue)
|
|
|
self.thread.finished.connect(self.on_finished)
|
|
|
@@ -211,6 +252,7 @@ class MainWindow(QMainWindow):
|
|
|
|
|
|
def on_finished(self, err, scraper, duration):
|
|
|
self.search_btn.setEnabled(True)
|
|
|
+ self.count_spin.setEnabled(True) # 任务结束后恢复数量输入
|
|
|
if scraper: self.active_scraper = scraper
|
|
|
|
|
|
if not err:
|