gui.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279
  1. import sys
  2. import os
  3. import time
  4. import traceback
  5. import pandas as pd
  6. from PyQt6.QtWidgets import (QApplication, QMainWindow, QWidget, QVBoxLayout,
  7. QHBoxLayout, QLineEdit, QPushButton, QTextEdit,
  8. QLabel, QFileDialog, QProgressBar, QTreeView, QSplitter, QCheckBox, QSpinBox)
  9. from PyQt6.QtCore import QThread, pyqtSignal, Qt
  10. from PyQt6.QtGui import QStandardItemModel, QStandardItem, QIcon
  11. from src.scraper import Scraper1688
  12. from src.excel_handler import append_to_template, get_existing_info
  13. def get_resource_path(relative_path):
  14. """ 获取资源绝对路径,兼容开发环境和 PyInstaller 打包环境 """
  15. if hasattr(sys, '_MEIPASS'):
  16. return os.path.join(sys._MEIPASS, relative_path)
  17. return os.path.join(os.getcwd(), relative_path)
  18. class ScraperThread(QThread):
  19. progress = pyqtSignal(int)
  20. log = pyqtSignal(str)
  21. # finished 信号增加耗时参数 (秒)
  22. finished = pyqtSignal(str, object, float)
  23. def __init__(self, keyword, output_path, total_count, headless=True):
  24. super().__init__()
  25. self.keyword = keyword
  26. self.output_path = output_path
  27. self.total_count = total_count
  28. self.headless = headless
  29. def run(self):
  30. scraper = None
  31. start_time = time.time()
  32. try:
  33. # 读取已抓取的链接,实现断点续爬
  34. existing_links, _ = get_existing_info(self.output_path)
  35. if existing_links:
  36. self.log.emit(f"[*] 发现已有记录: {len(existing_links)} 条,将从新记录开始搜索...")
  37. self.log.emit(f"<b>[*] 任务启动: {self.keyword}</b>")
  38. def status_cb(is_waiting, msg):
  39. if is_waiting:
  40. self.log.emit(f"<font color='red' size='5'><b>!!! {msg} !!!</b></font>")
  41. else:
  42. self.log.emit(f"<font color='green'><b>[√] {msg}</b></font>")
  43. scraper = Scraper1688(headless=self.headless, status_callback=status_cb)
  44. # 使用流式生成器抓取
  45. collected_count = 0
  46. product_index = 0
  47. for batch_results in scraper.search_products_yield(self.keyword, total_count=self.total_count, existing_links=existing_links):
  48. # 实时写入 Excel (此时 batch_results 为 10 条或页末余数)
  49. append_to_template(batch_results, self.output_path, status_callback=status_cb)
  50. # 计算本批次包含的独立商品数量
  51. unique_links_in_batch = len(set(item.get('link') for item in batch_results if item.get('link')))
  52. product_index += unique_links_in_batch
  53. collected_count += len(batch_results)
  54. self.log.emit(f"[+] 解析到第 {product_index} 个商品,新增数据已持久化: {len(batch_results)} 条,本次共计: {collected_count}")
  55. prog = int((product_index / self.total_count) * 100)
  56. self.progress.emit(min(prog, 100))
  57. duration = time.time() - start_time
  58. self.log.emit(f"<b>[完成] 任务结束,本次新增抓取 {collected_count} 条数据。</b>")
  59. self.log.emit(f"<b>[耗时] 处理总时间: {duration:.2f} 秒</b>")
  60. self.finished.emit("", scraper, duration)
  61. except Exception as e:
  62. duration = time.time() - start_time
  63. err = traceback.format_exc()
  64. self.log.emit(f"<font color='red'>[错误] {str(e)}</font>")
  65. self.finished.emit(err, scraper, duration)
  66. class MainWindow(QMainWindow):
  67. def __init__(self):
  68. super().__init__()
  69. self.selected_category_1 = ""
  70. self.selected_category_2 = ""
  71. self.output_base_path = ""
  72. self.active_scraper = None
  73. self.initUI()
  74. self.load_default_categories()
  75. def initUI(self):
  76. self.setWindowTitle("1688 产品信息实时抓取工具 v3.0")
  77. self.setGeometry(100, 100, 1100, 750)
  78. # 设置窗口图标
  79. icon_path = get_resource_path("app.ico")
  80. if os.path.exists(icon_path):
  81. self.setWindowIcon(QIcon(icon_path))
  82. central_widget = QWidget()
  83. self.setCentralWidget(central_widget)
  84. main_layout = QHBoxLayout(central_widget)
  85. # 左侧类目树
  86. left_widget = QWidget()
  87. left_layout = QVBoxLayout(left_widget)
  88. self.load_category_btn = QPushButton("选择类目文件")
  89. self.load_category_btn.clicked.connect(self.select_category_file)
  90. self.category_tree = QTreeView()
  91. self.category_tree.setHeaderHidden(True)
  92. self.category_model = QStandardItemModel()
  93. self.category_tree.setModel(self.category_model)
  94. self.category_tree.clicked.connect(self.on_category_clicked)
  95. left_layout.addWidget(QLabel("<b>商品类目树</b>"))
  96. left_layout.addWidget(self.load_category_btn)
  97. left_layout.addWidget(self.category_tree)
  98. # 右侧操作区
  99. right_widget = QWidget()
  100. right_layout = QVBoxLayout(right_widget)
  101. opt_layout = QHBoxLayout()
  102. self.show_browser_cb = QCheckBox("显示浏览器界面 (手动过验证时勾选)")
  103. self.show_browser_cb.setChecked(True)
  104. opt_layout.addWidget(self.show_browser_cb)
  105. right_layout.addLayout(opt_layout)
  106. path_layout = QHBoxLayout()
  107. self.path_display = QLabel("未选择输出路径")
  108. self.path_display.setStyleSheet("color: gray; border: 1px solid #ccc; padding: 5px;")
  109. self.select_path_btn = QPushButton("选择输出目录")
  110. self.select_path_btn.clicked.connect(self.select_output_path)
  111. path_layout.addWidget(QLabel("<font color='red'>*</font>输出路径:"))
  112. path_layout.addWidget(self.path_display, 1)
  113. path_layout.addWidget(self.select_path_btn)
  114. right_layout.addLayout(path_layout)
  115. action_layout = QHBoxLayout()
  116. self.category_display = QLabel("请选择二级类目")
  117. # 抓取数量配置
  118. count_layout = QHBoxLayout()
  119. self.count_spin = QSpinBox()
  120. self.count_spin.setRange(1, 10000)
  121. self.count_spin.setValue(200)
  122. self.count_spin.setFixedWidth(80)
  123. count_layout.addWidget(QLabel("抓取数量:"))
  124. count_layout.addWidget(self.count_spin)
  125. self.search_btn = QPushButton("开始抓取")
  126. self.search_btn.setEnabled(False) # 初始置灰,直到选择类目和路径
  127. self.search_btn.clicked.connect(self.start_scraping)
  128. self.search_btn.setMinimumHeight(50)
  129. self.search_btn.setStyleSheet("""
  130. QPushButton {
  131. background-color: #0078d4;
  132. color: white;
  133. font-weight: bold;
  134. font-size: 16px;
  135. border-radius: 4px;
  136. }
  137. QPushButton:disabled {
  138. background-color: #cccccc;
  139. color: #888888;
  140. }
  141. """)
  142. action_layout.addWidget(QLabel("<font color='red'>*</font>检索类目:"))
  143. action_layout.addWidget(self.category_display, 1)
  144. action_layout.addLayout(count_layout)
  145. action_layout.addWidget(self.search_btn)
  146. right_layout.addLayout(action_layout)
  147. self.pbar = QProgressBar()
  148. self.log_output = QTextEdit()
  149. self.log_output.setReadOnly(True)
  150. right_layout.addWidget(QLabel("<b>任务日志:</b>"))
  151. right_layout.addWidget(self.log_output)
  152. right_layout.addWidget(self.pbar)
  153. self.status_label = QLabel("就绪")
  154. right_layout.addWidget(self.status_label)
  155. splitter = QSplitter(Qt.Orientation.Horizontal)
  156. splitter.addWidget(left_widget)
  157. splitter.addWidget(right_widget)
  158. splitter.setStretchFactor(1, 3)
  159. main_layout.addWidget(splitter)
  160. def load_default_categories(self):
  161. p = os.path.join('templates', '商品类目.xlsx')
  162. if os.path.exists(p): self.load_categories(p)
  163. def select_category_file(self):
  164. f, _ = QFileDialog.getOpenFileName(self, "选择类目文件", "templates", "Excel (*.xlsx)")
  165. if f: self.load_categories(f)
  166. def load_categories(self, f_path):
  167. try:
  168. df = pd.read_excel(f_path)
  169. c1, c2 = df.columns[0], df.columns[1]
  170. df[c1] = df[c1].ffill()
  171. self.category_model.clear()
  172. cats = {}
  173. for _, row in df.iterrows():
  174. v1, v2 = str(row[c1]), str(row[c2])
  175. if v1 not in cats:
  176. p = QStandardItem(v1); p.setSelectable(False)
  177. self.category_model.appendRow(p); cats[v1] = p
  178. child = QStandardItem(v2); child.setData(v1, Qt.ItemDataRole.UserRole)
  179. cats[v1].appendRow(child)
  180. self.category_tree.expandAll()
  181. except: pass
  182. def on_category_clicked(self, index):
  183. item = self.category_model.itemFromIndex(index)
  184. if item.isSelectable():
  185. self.selected_category_2 = item.text()
  186. self.selected_category_1 = item.data(Qt.ItemDataRole.UserRole)
  187. self.update_displays()
  188. def update_displays(self):
  189. if self.selected_category_1 and self.selected_category_2:
  190. self.category_display.setText(f"{self.selected_category_1} / <font color='#0078d4'><b>{self.selected_category_2}</b></font>")
  191. if self.output_base_path:
  192. full_p = os.path.normpath(os.path.join(self.output_base_path, "选品", self.selected_category_1, f"{self.selected_category_2}.xlsx"))
  193. self.path_display.setText(full_p)
  194. self.search_btn.setEnabled(True) # 仅在路径和类目都选好时启用按钮
  195. def select_output_path(self):
  196. p = QFileDialog.getExistingDirectory(self, "选择保存根目录")
  197. if p: self.output_base_path = p; self.update_displays()
  198. def start_scraping(self):
  199. if not self.selected_category_2 or not self.output_base_path:
  200. self.log_output.append("<font color='red'>[错误] 请选择类目和输出路径</font>")
  201. return
  202. target_dir = os.path.join(self.output_base_path, "选品", self.selected_category_1)
  203. file_path = os.path.normpath(os.path.join(target_dir, f"{self.selected_category_2}.xlsx"))
  204. self.current_output_file = file_path # 记录当前文件用于最后打开
  205. # 启动抓取前不再删除旧文件,实现断点续爬功能
  206. self.search_btn.setEnabled(False)
  207. self.count_spin.setEnabled(False) # 任务开始后也禁用数量输入
  208. self.status_label.setText("处理中……")
  209. self.log_output.clear()
  210. self.pbar.setValue(0)
  211. headless = not self.show_browser_cb.isChecked()
  212. total_count = self.count_spin.value()
  213. self.thread = ScraperThread(self.selected_category_2, file_path, total_count, headless)
  214. self.thread.log.connect(self.log_output.append)
  215. self.thread.progress.connect(self.pbar.setValue)
  216. self.thread.finished.connect(self.on_finished)
  217. self.thread.start()
  218. def on_finished(self, err, scraper, duration):
  219. self.search_btn.setEnabled(True)
  220. self.count_spin.setEnabled(True) # 任务结束后恢复数量输入
  221. if scraper: self.active_scraper = scraper
  222. if not err:
  223. self.status_label.setText("任务完成")
  224. # 自动打开目标文件
  225. if hasattr(self, 'current_output_file') and os.path.exists(self.current_output_file):
  226. try:
  227. os.startfile(self.current_output_file)
  228. self.log_output.append(f"<font color='blue'>[系统] 已自动打开结果文件</font>")
  229. except Exception as e:
  230. self.log_output.append(f"<font color='orange'>[警告] 无法自动打开文件: {e}</font>")
  231. else:
  232. self.status_label.setText("异常终止")
  233. if __name__ == "__main__":
  234. app = QApplication(sys.argv)
  235. win = MainWindow()
  236. win.show()
  237. sys.exit(app.exec())