LuTong hace 2 meses
padre
commit
1be22b1a75
Se han modificado 3 ficheros con 57 adiciones y 47 borrados
  1. src/excel_handler.py (+8 −4)
  2. src/gui.py (+22 −2)
  3. src/scraper.py (+27 −41)

+ 8 - 4
src/excel_handler.py

@@ -27,10 +27,13 @@ def get_existing_info(file_path):
 
 def append_to_template(products, output_path, status_callback=None):
     template_path = get_resource_path(os.path.join('templates', '【进价】产品信息空表.xlsx'))
-    if not os.path.exists(template_path): template_path = os.path.join('templates', '【进价】产品信息空表.xlsx')
-    if not os.path.exists(template_path): raise FileNotFoundError(f"未找到核心模板文件: {template_path}")
+    if not os.path.exists(template_path):
+        template_path = os.path.join('templates', '【进价】产品信息空表.xlsx')
+    if not os.path.exists(template_path):
+        raise FileNotFoundError(f"未找到核心模板文件: {template_path}")
 
-    if os.path.exists(output_path): wb = load_workbook(output_path)
+    if os.path.exists(output_path):
+        wb = load_workbook(output_path)
     else:
         os.makedirs(os.path.dirname(output_path), exist_ok=True)
         wb = load_workbook(template_path)
@@ -65,7 +68,8 @@ def append_to_template(products, output_path, status_callback=None):
         ws.cell(row=row, column=12, value=product.get('supplier', ''))
         if product.get('link'): current_links.add(str(product['link']).strip())
 
-    if "统计状态" not in wb.sheetnames: wb.create_sheet("统计状态")
+    if "统计状态" not in wb.sheetnames:
+        wb.create_sheet("统计状态")
     ws_stat = wb["统计状态"]
     ws_stat.cell(row=1, column=1, value="已解析商品总数")
     ws_stat.cell(row=1, column=2, value=len(current_links))

+ 22 - 2
src/gui.py

@@ -13,7 +13,8 @@ from src.scraper import Scraper1688
 from src.excel_handler import append_to_template, get_existing_info
 
 def get_resource_path(relative_path):
-    if hasattr(sys, '_MEIPASS'): return os.path.join(sys._MEIPASS, relative_path)
+    if hasattr(sys, '_MEIPASS'):
+        return os.path.join(sys._MEIPASS, relative_path)
     return os.path.join(os.getcwd(), relative_path)
 
 class ScraperThread(QThread):
@@ -33,6 +34,7 @@ class ScraperThread(QThread):
         start_time = time.time()
         try:
             existing_links, _ = get_existing_info(self.output_path)
+            
             initial_p_count = 0
             if os.path.exists(self.output_path):
                 try:
@@ -45,20 +47,25 @@ class ScraperThread(QThread):
                 except: pass
 
             self.log.emit(f"<b>[*] 任务启动: {self.keyword}</b>")
+            
             def status_cb(is_waiting, msg):
                 if is_waiting: self.log.emit(f"<font color='red' size='5'><b>!!! {msg} !!!</b></font>")
                 else: self.log.emit(f"<font color='green'><b>[√] {msg}</b></font>")
 
             scraper = Scraper1688(headless=self.headless, status_callback=status_cb, log_callback=self.log.emit)
+            
             collected_count = 0
             product_index = initial_p_count
             
             for batch_results in scraper.search_products_yield(self.keyword, total_count=self.total_count, existing_links=existing_links):
                 append_to_template(batch_results, self.output_path, status_callback=status_cb)
+                
                 unique_links = len(set(item.get('link') for item in batch_results if item.get('link')))
                 product_index += unique_links
                 collected_count += len(batch_results)
+                
                 self.log.emit(f"[+] 解析到第 {product_index} 个商品,新增数据已持久化: {len(batch_results)} 条,本次共计: {collected_count}")
+                
                 current_task_done = product_index - initial_p_count
                 prog = int((current_task_done / self.total_count) * 100)
                 self.progress.emit(min(prog, 100))
@@ -88,9 +95,11 @@ class MainWindow(QMainWindow):
         self.setGeometry(100, 100, 1100, 750)
         icon_path = get_resource_path("app.ico")
         if os.path.exists(icon_path): self.setWindowIcon(QIcon(icon_path))
+
         central_widget = QWidget()
         self.setCentralWidget(central_widget)
         main_layout = QHBoxLayout(central_widget)
+
         left_widget = QWidget()
         left_layout = QVBoxLayout(left_widget)
         self.load_category_btn = QPushButton("选择类目文件")
@@ -103,13 +112,16 @@ class MainWindow(QMainWindow):
         left_layout.addWidget(QLabel("<b>商品类目树</b>"))
         left_layout.addWidget(self.load_category_btn)
         left_layout.addWidget(self.category_tree)
+
         right_widget = QWidget()
         right_layout = QVBoxLayout(right_widget)
+
         opt_layout = QHBoxLayout()
         self.show_browser_cb = QCheckBox("显示浏览器界面 (手动过验证时勾选)")
         self.show_browser_cb.setChecked(True)
         opt_layout.addWidget(self.show_browser_cb)
         right_layout.addLayout(opt_layout)
+
         path_layout = QHBoxLayout()
         self.path_display = QLabel("未选择输出路径")
         self.path_display.setStyleSheet("color: gray; border: 1px solid #ccc; padding: 5px;")
@@ -119,6 +131,7 @@ class MainWindow(QMainWindow):
         path_layout.addWidget(self.path_display, 1)
         path_layout.addWidget(self.select_path_btn)
         right_layout.addLayout(path_layout)
+
         action_layout = QHBoxLayout()
         self.category_display = QLabel("请选择二级类目")
         count_layout = QHBoxLayout()
@@ -128,16 +141,22 @@ class MainWindow(QMainWindow):
         self.count_spin.setFixedWidth(80)
         count_layout.addWidget(QLabel("抓取数量:"))
         count_layout.addWidget(self.count_spin)
+        
         self.search_btn = QPushButton("开始抓取")
         self.search_btn.setEnabled(False)
         self.search_btn.clicked.connect(self.start_scraping)
         self.search_btn.setMinimumHeight(50)
-        self.search_btn.setStyleSheet("QPushButton { background-color: #0078d4; color: white; font-weight: bold; font-size: 16px; border-radius: 4px; } QPushButton:disabled { background-color: #cccccc; color: #888888; }")
+        self.search_btn.setStyleSheet("""
+            QPushButton { background-color: #0078d4; color: white; font-weight: bold; font-size: 16px; border-radius: 4px; }
+            QPushButton:disabled { background-color: #cccccc; color: #888888; }
+        """)
+        
         action_layout.addWidget(QLabel("<font color='red'>*</font>检索类目:"))
         action_layout.addWidget(self.category_display, 1)
         action_layout.addLayout(count_layout)
         action_layout.addWidget(self.search_btn)
         right_layout.addLayout(action_layout)
+
         self.pbar = QProgressBar()
         self.log_output = QTextEdit()
         self.log_output.setReadOnly(True)
@@ -146,6 +165,7 @@ class MainWindow(QMainWindow):
         right_layout.addWidget(self.pbar)
         self.status_label = QLabel("就绪")
         right_layout.addWidget(self.status_label)
+
         splitter = QSplitter(Qt.Orientation.Horizontal)
         splitter.addWidget(left_widget)
         splitter.addWidget(right_widget)

+ 27 - 41
src/scraper.py

@@ -34,10 +34,7 @@ class Scraper1688:
 
     def _find_chrome(self):
         import winreg
-        reg_paths = [
-            (winreg.HKEY_LOCAL_MACHINE, r"SOFTWARE\Microsoft\Windows\CurrentVersion\App Paths\chrome.exe"),
-            (winreg.HKEY_CURRENT_USER, r"SOFTWARE\Microsoft\Windows\CurrentVersion\App Paths\chrome.exe")
-        ]
+        reg_paths = [(winreg.HKEY_LOCAL_MACHINE, r"SOFTWARE\Microsoft\Windows\CurrentVersion\App Paths\chrome.exe"), (winreg.HKEY_CURRENT_USER, r"SOFTWARE\Microsoft\Windows\CurrentVersion\App Paths\chrome.exe")]
         for hkey, subkey in reg_paths:
             try:
                 with winreg.OpenKey(hkey, subkey) as key:
@@ -73,7 +70,7 @@ class Scraper1688:
             return opts
         try:
             self.driver = uc.Chrome(options=create_options(), headless=headless, browser_executable_path=chrome_path, use_subprocess=True)
-        except:
+        except Exception:
             self.driver = uc.Chrome(options=create_options(), headless=headless, use_subprocess=True)
 
     def clean_url(self, url):
@@ -112,37 +109,29 @@ class Scraper1688:
             print(f"[*] 正在处理列表页: 第 {page} 页...")
             self.driver.get(f"{base_url}&beginPage={page}&page={page}")
             self.check_for_captcha()
-            for i in range(1, 9):
-                self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/8});")
-                time.sleep(random.uniform(1.2, 2.5))
-                if i == 4:
-                    self.driver.execute_script("window.scrollBy(0, -400);")
-                    time.sleep(1.0)
-            time.sleep(5)
+            for i in range(1, 6):
+                self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/5});")
+                time.sleep(1.5)
 
             page_results = self._extract_all_methods()
-            print(f"  [+] 本页发现 {len(page_results)} 个商品原始条目")
+            print(f"  [+] 本页发现 {len(page_results)} 个原始条目")
             
             page_batch = []
             for it in page_results:
-                clean_url = self.clean_url(it.get("link"))
+                clean_url = self.clean_url(it["link"])
                 if clean_url and clean_url not in all_links:
                     all_links.add(clean_url)
-                    print(f"  [>] 正在启动详情抓取: {clean_url}")
+                    print(f"  [>] 正在执行详情抓取流程: {clean_url}")
                     detail_results = self.scrape_detail(clean_url)
                     if detail_results:
                         page_batch.extend(detail_results)
                     else:
-                        page_batch.append({
-                            "category": "", "brand": "", "name": it.get("name", "未知"),
-                            "color": "", "spec": "", "material": "", "price": "",
-                            "moq": "", "wholesale_price": "", "link": clean_url, "supplier": ""
-                        })
+                        page_batch.append({"category": "", "brand": "", "name": it.get("name", "未知"), "color": "", "spec": "", "material": "", "price": "", "moq": "", "wholesale_price": "", "link": clean_url, "supplier": ""})
                     
                     if len(page_batch) >= 10:
                         yield page_batch
                         page_batch = []
-                    time.sleep(random.uniform(15, 25)) 
+                    time.sleep(random.uniform(5, 10)) 
                     if len(all_links) >= total_count + initial_count: break
             
             if page_batch: yield page_batch
@@ -153,10 +142,9 @@ class Scraper1688:
         return list(all_links)
 
     def scrape_detail(self, url):
-        """ 深度解析详情页,支持款式和逐条价格获取 """
         try:
             self.driver.get(url)
-            time.sleep(random.uniform(5, 10))
+            time.sleep(random.uniform(5, 8))
             self.check_for_captcha()
             model = self.driver.execute_script(
                 "return (window.context && window.context.result && window.context.result.global && window.context.result.global.globalData && window.context.result.global.globalData.model) || window.__INITIAL_DATA__ || window.iDetailData || window.iDetailConfig || null;"
@@ -197,28 +185,21 @@ class Scraper1688:
                         try:
                             label = item_el.find_element(By.CLASS_NAME, "item-label").text.strip()
                             price = item_el.find_element(By.CLASS_NAME, "item-price-stock").text.strip()
-                            if label:
-                                variant_data_list.append({"label": label, "price": re.sub(r'[^\d.]', '', price)})
+                            if label: variant_data_list.append({"label": label, "price": re.sub(r'[^\d.]', '', price)})
                         except: continue
             except: pass
 
             if variant_data_list:
                 results = []
                 for vd in variant_data_list:
-                    row = base_data.copy()
-                    row["color"] = vd["label"]
-                    row["price"] = vd["price"]
-                    results.append(row)
+                    row = base_data.copy(); row["color"] = vd["label"]; row["price"] = vd["price"]; results.append(row)
                 return results
             return [base_data]
         except: return None
 
     def _extract_all_methods(self):
         results = []
-        scripts = [
-            "return JSON.stringify(window.data || window.context?.result?.data || window.__INITIAL_DATA__)",
-            "return JSON.stringify(window.context?.result?.global?.globalData?.data || null)"
-        ]
+        scripts = ["return JSON.stringify(window.data || window.context?.result?.data || window.__INITIAL_DATA__)", "return JSON.stringify(window.context?.result?.global?.globalData?.data || null)"]
         for s in scripts:
             try:
                 res = self.driver.execute_script(s)
@@ -236,14 +217,19 @@ class Scraper1688:
                         if link: results.append({"name": str(o.get('title', o.get('subject', ''))), "link": link})
                     if results: return results
             except: continue
-        for s in [".sm-offer-item", ".offer-card-item", "[class*='offer-card']", ".offer-item"]:
-            for el in self.driver.find_elements(By.CSS_SELECTOR, s):
-                try:
-                    a = el.find_element(By.TAG_NAME, "a")
-                    link = a.get_attribute("href")
-                    if link and "1688.com" in link: results.append({"name": el.text.split('\n')[0][:50], "link": link})
-                except: continue
-            if results: break
+        
+        links = self.driver.find_elements(By.TAG_NAME, "a")
+        seen_ids = set()
+        for l in links:
+            try:
+                href = l.get_attribute("href")
+                if href:
+                    id_match = re.search(r'offer/(\d{9,15})\.html', href)
+                    if id_match:
+                        oid = id_match.group(1)
+                        if oid not in seen_ids:
+                            seen_ids.add(oid); results.append({"name": l.text.split('\n')[0][:50] or f"商品-{oid}", "link": href})
+            except: continue
         return results
 
     def quit(self):