|
|
@@ -34,10 +34,7 @@ class Scraper1688:
|
|
|
|
|
|
def _find_chrome(self):
|
|
|
import winreg
|
|
|
- reg_paths = [
|
|
|
- (winreg.HKEY_LOCAL_MACHINE, r"SOFTWARE\Microsoft\Windows\CurrentVersion\App Paths\chrome.exe"),
|
|
|
- (winreg.HKEY_CURRENT_USER, r"SOFTWARE\Microsoft\Windows\CurrentVersion\App Paths\chrome.exe")
|
|
|
- ]
|
|
|
+ reg_paths = [(winreg.HKEY_LOCAL_MACHINE, r"SOFTWARE\Microsoft\Windows\CurrentVersion\App Paths\chrome.exe"), (winreg.HKEY_CURRENT_USER, r"SOFTWARE\Microsoft\Windows\CurrentVersion\App Paths\chrome.exe")]
|
|
|
for hkey, subkey in reg_paths:
|
|
|
try:
|
|
|
with winreg.OpenKey(hkey, subkey) as key:
|
|
|
@@ -73,7 +70,7 @@ class Scraper1688:
|
|
|
return opts
|
|
|
try:
|
|
|
self.driver = uc.Chrome(options=create_options(), headless=headless, browser_executable_path=chrome_path, use_subprocess=True)
|
|
|
- except:
|
|
|
+ except Exception:
|
|
|
self.driver = uc.Chrome(options=create_options(), headless=headless, use_subprocess=True)
|
|
|
|
|
|
def clean_url(self, url):
|
|
|
@@ -112,37 +109,29 @@ class Scraper1688:
|
|
|
print(f"[*] 正在处理列表页: 第 {page} 页...")
|
|
|
self.driver.get(f"{base_url}&beginPage={page}&page={page}")
|
|
|
self.check_for_captcha()
|
|
|
- for i in range(1, 9):
|
|
|
- self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/8});")
|
|
|
- time.sleep(random.uniform(1.2, 2.5))
|
|
|
- if i == 4:
|
|
|
- self.driver.execute_script("window.scrollBy(0, -400);")
|
|
|
- time.sleep(1.0)
|
|
|
- time.sleep(5)
|
|
|
+ for i in range(1, 6):
|
|
|
+ self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/5});")
|
|
|
+ time.sleep(1.5)
|
|
|
|
|
|
page_results = self._extract_all_methods()
|
|
|
- print(f" [+] 本页发现 {len(page_results)} 个商品原始条目")
|
|
|
+ print(f" [+] 本页发现 {len(page_results)} 个原始条目")
|
|
|
|
|
|
page_batch = []
|
|
|
for it in page_results:
|
|
|
- clean_url = self.clean_url(it.get("link"))
|
|
|
+ clean_url = self.clean_url(it["link"])
|
|
|
if clean_url and clean_url not in all_links:
|
|
|
all_links.add(clean_url)
|
|
|
- print(f" [>] 正在启动详情抓取: {clean_url}")
|
|
|
+ print(f" [>] 正在执行详情抓取流程: {clean_url}")
|
|
|
detail_results = self.scrape_detail(clean_url)
|
|
|
if detail_results:
|
|
|
page_batch.extend(detail_results)
|
|
|
else:
|
|
|
- page_batch.append({
|
|
|
- "category": "", "brand": "", "name": it.get("name", "未知"),
|
|
|
- "color": "", "spec": "", "material": "", "price": "",
|
|
|
- "moq": "", "wholesale_price": "", "link": clean_url, "supplier": ""
|
|
|
- })
|
|
|
+ page_batch.append({"category": "", "brand": "", "name": it.get("name", "未知"), "color": "", "spec": "", "material": "", "price": "", "moq": "", "wholesale_price": "", "link": clean_url, "supplier": ""})
|
|
|
|
|
|
if len(page_batch) >= 10:
|
|
|
yield page_batch
|
|
|
page_batch = []
|
|
|
- time.sleep(random.uniform(15, 25))
|
|
|
+ time.sleep(random.uniform(5, 10))
|
|
|
if len(all_links) >= total_count + initial_count: break
|
|
|
|
|
|
if page_batch: yield page_batch
|
|
|
@@ -153,10 +142,9 @@ class Scraper1688:
|
|
|
return list(all_links)
|
|
|
|
|
|
def scrape_detail(self, url):
|
|
|
- """ 深度解析详情页,支持款式和逐条价格获取 """
|
|
|
try:
|
|
|
self.driver.get(url)
|
|
|
- time.sleep(random.uniform(5, 10))
|
|
|
+ time.sleep(random.uniform(5, 8))
|
|
|
self.check_for_captcha()
|
|
|
model = self.driver.execute_script(
|
|
|
"return (window.context && window.context.result && window.context.result.global && window.context.result.global.globalData && window.context.result.global.globalData.model) || window.__INITIAL_DATA__ || window.iDetailData || window.iDetailConfig || null;"
|
|
|
@@ -197,28 +185,21 @@ class Scraper1688:
|
|
|
try:
|
|
|
label = item_el.find_element(By.CLASS_NAME, "item-label").text.strip()
|
|
|
price = item_el.find_element(By.CLASS_NAME, "item-price-stock").text.strip()
|
|
|
- if label:
|
|
|
- variant_data_list.append({"label": label, "price": re.sub(r'[^\d.]', '', price)})
|
|
|
+ if label: variant_data_list.append({"label": label, "price": re.sub(r'[^\d.]', '', price)})
|
|
|
except: continue
|
|
|
except: pass
|
|
|
|
|
|
if variant_data_list:
|
|
|
results = []
|
|
|
for vd in variant_data_list:
|
|
|
- row = base_data.copy()
|
|
|
- row["color"] = vd["label"]
|
|
|
- row["price"] = vd["price"]
|
|
|
- results.append(row)
|
|
|
+ row = base_data.copy(); row["color"] = vd["label"]; row["price"] = vd["price"]; results.append(row)
|
|
|
return results
|
|
|
return [base_data]
|
|
|
except: return None
|
|
|
|
|
|
def _extract_all_methods(self):
|
|
|
results = []
|
|
|
- scripts = [
|
|
|
- "return JSON.stringify(window.data || window.context?.result?.data || window.__INITIAL_DATA__)",
|
|
|
- "return JSON.stringify(window.context?.result?.global?.globalData?.data || null)"
|
|
|
- ]
|
|
|
+ scripts = ["return JSON.stringify(window.data || window.context?.result?.data || window.__INITIAL_DATA__)", "return JSON.stringify(window.context?.result?.global?.globalData?.data || null)"]
|
|
|
for s in scripts:
|
|
|
try:
|
|
|
res = self.driver.execute_script(s)
|
|
|
@@ -236,14 +217,19 @@ class Scraper1688:
|
|
|
if link: results.append({"name": str(o.get('title', o.get('subject', ''))), "link": link})
|
|
|
if results: return results
|
|
|
except: continue
|
|
|
- for s in [".sm-offer-item", ".offer-card-item", "[class*='offer-card']", ".offer-item"]:
|
|
|
- for el in self.driver.find_elements(By.CSS_SELECTOR, s):
|
|
|
- try:
|
|
|
- a = el.find_element(By.TAG_NAME, "a")
|
|
|
- link = a.get_attribute("href")
|
|
|
- if link and "1688.com" in link: results.append({"name": el.text.split('\n')[0][:50], "link": link})
|
|
|
- except: continue
|
|
|
- if results: break
|
|
|
+
|
|
|
+ links = self.driver.find_elements(By.TAG_NAME, "a")
|
|
|
+ seen_ids = set()
|
|
|
+ for l in links:
|
|
|
+ try:
|
|
|
+ href = l.get_attribute("href")
|
|
|
+ if href:
|
|
|
+ id_match = re.search(r'offer/(\d{9,15})\.html', href)
|
|
|
+ if id_match:
|
|
|
+ oid = id_match.group(1)
|
|
|
+ if oid not in seen_ids:
|
|
|
+ seen_ids.add(oid); results.append({"name": l.text.split('\n')[0][:50] or f"商品-{oid}", "link": href})
|
|
|
+                except Exception: continue
|
|
|
return results
|
|
|
|
|
|
def quit(self):
|