LuTong vor 3 Monaten
Ursprung
Commit
e549f8ad74
3 geänderte Dateien mit 50 neuen und 114 gelöschten Zeilen
  1. 8 35
      src/excel_handler.py
  2. 1 18
      src/gui.py
  3. 41 61
      src/scraper.py

+ 8 - 35
src/excel_handler.py

@@ -4,70 +4,46 @@ import time
 from openpyxl import load_workbook
 
 def get_resource_path(relative_path):
-    """ 获取资源绝对路径,兼容开发环境和 PyInstaller 打包环境 """
     if hasattr(sys, '_MEIPASS'):
         return os.path.join(sys._MEIPASS, relative_path)
     base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
     return os.path.join(base_dir, relative_path)
 
 def get_existing_info(file_path):
-    """
-    读取已有文件中的链接和最后一行编码
-    """
     links = set()
     last_code = 0
     if not os.path.exists(file_path):
         return links, last_code
-    
     try:
         wb = load_workbook(file_path, data_only=True)
         ws = wb.active
-        # 假设 A 列是编码,K 列是链接
         for r in range(3, ws.max_row + 1):
             code_val = ws.cell(row=r, column=1).value
             link_val = ws.cell(row=r, column=11).value
-            
-            if link_val:
-                links.add(str(link_val).strip())
-            
-            if isinstance(code_val, (int, float)):
-                last_code = max(last_code, int(code_val))
-    except:
-        pass
+            if link_val: links.add(str(link_val).strip())
+            if isinstance(code_val, (int, float)): last_code = max(last_code, int(code_val))
+    except: pass
     return links, last_code
 
 def append_to_template(products, output_path, status_callback=None):
-    """
-    将产品数据追加写入到指定的 Excel 文件中。
-    并在第二个 Sheet 中记录商品总数。
-    """
     template_path = get_resource_path(os.path.join('templates', '【进价】产品信息空表.xlsx'))
-    
-    if not os.path.exists(template_path):
-        template_path = os.path.join('templates', '【进价】产品信息空表.xlsx')
-
-    if not os.path.exists(template_path):
-        raise FileNotFoundError(f"未找到核心模板文件: {template_path}")
+    if not os.path.exists(template_path): template_path = os.path.join('templates', '【进价】产品信息空表.xlsx')
+    if not os.path.exists(template_path): raise FileNotFoundError(f"未找到核心模板文件: {template_path}")
 
-    if os.path.exists(output_path):
-        wb = load_workbook(output_path)
+    if os.path.exists(output_path): wb = load_workbook(output_path)
     else:
         os.makedirs(os.path.dirname(output_path), exist_ok=True)
         wb = load_workbook(template_path)
     
     ws = wb.active
-    
-    # 寻找起始行 (基于第 11 列“产品链接”进行判定)
     start_row = 3
     for r in range(3, ws.max_row + 2):
         val_link = ws.cell(row=r, column=11).value
         if val_link is None or str(val_link).strip() == "":
             start_row = r
             break
-    else:
-        start_row = ws.max_row + 1
+    else: start_row = ws.max_row + 1
     
-    # 获取当前已有的链接集合用于统计
     current_links = set()
     for r in range(3, start_row):
         link = ws.cell(row=r, column=11).value
@@ -87,12 +63,9 @@ def append_to_template(products, output_path, status_callback=None):
         ws.cell(row=row, column=10, value=product.get('wholesale_price', ''))
         ws.cell(row=row, column=11, value=product.get('link', '')) 
         ws.cell(row=row, column=12, value=product.get('supplier', ''))
-        
         if product.get('link'): current_links.add(str(product['link']).strip())
 
-    # 写入/更新统计 Sheet
-    if "统计状态" not in wb.sheetnames:
-        wb.create_sheet("统计状态")
+    if "统计状态" not in wb.sheetnames: wb.create_sheet("统计状态")
     ws_stat = wb["统计状态"]
     ws_stat.cell(row=1, column=1, value="已解析商品总数")
     ws_stat.cell(row=1, column=2, value=len(current_links))

+ 1 - 18
src/gui.py

@@ -13,8 +13,7 @@ from src.scraper import Scraper1688
 from src.excel_handler import append_to_template, get_existing_info
 
 def get_resource_path(relative_path):
-    if hasattr(sys, '_MEIPASS'):
-        return os.path.join(sys._MEIPASS, relative_path)
+    if hasattr(sys, '_MEIPASS'): return os.path.join(sys._MEIPASS, relative_path)
     return os.path.join(os.getcwd(), relative_path)
 
 class ScraperThread(QThread):
@@ -34,7 +33,6 @@ class ScraperThread(QThread):
         start_time = time.time()
         try:
             existing_links, _ = get_existing_info(self.output_path)
-            
             initial_p_count = 0
             if os.path.exists(self.output_path):
                 try:
@@ -47,25 +45,20 @@ class ScraperThread(QThread):
                 except: pass
 
             self.log.emit(f"<b>[*] 任务启动: {self.keyword}</b>")
-            
             def status_cb(is_waiting, msg):
                 if is_waiting: self.log.emit(f"<font color='red' size='5'><b>!!! {msg} !!!</b></font>")
                 else: self.log.emit(f"<font color='green'><b>[√] {msg}</b></font>")
 
             scraper = Scraper1688(headless=self.headless, status_callback=status_cb, log_callback=self.log.emit)
-            
             collected_count = 0
             product_index = initial_p_count
             
             for batch_results in scraper.search_products_yield(self.keyword, total_count=self.total_count, existing_links=existing_links):
                 append_to_template(batch_results, self.output_path, status_callback=status_cb)
-                
                 unique_links = len(set(item.get('link') for item in batch_results if item.get('link')))
                 product_index += unique_links
                 collected_count += len(batch_results)
-                
                 self.log.emit(f"[+] 解析到第 {product_index} 个商品,新增数据已持久化: {len(batch_results)} 条,本次共计: {collected_count}")
-                
                 current_task_done = product_index - initial_p_count
                 prog = int((current_task_done / self.total_count) * 100)
                 self.progress.emit(min(prog, 100))
@@ -95,11 +88,9 @@ class MainWindow(QMainWindow):
         self.setGeometry(100, 100, 1100, 750)
         icon_path = get_resource_path("app.ico")
         if os.path.exists(icon_path): self.setWindowIcon(QIcon(icon_path))
-
         central_widget = QWidget()
         self.setCentralWidget(central_widget)
         main_layout = QHBoxLayout(central_widget)
-
         left_widget = QWidget()
         left_layout = QVBoxLayout(left_widget)
         self.load_category_btn = QPushButton("选择类目文件")
@@ -112,16 +103,13 @@ class MainWindow(QMainWindow):
         left_layout.addWidget(QLabel("<b>商品类目树</b>"))
         left_layout.addWidget(self.load_category_btn)
         left_layout.addWidget(self.category_tree)
-
         right_widget = QWidget()
         right_layout = QVBoxLayout(right_widget)
-
         opt_layout = QHBoxLayout()
         self.show_browser_cb = QCheckBox("显示浏览器界面 (手动过验证时勾选)")
         self.show_browser_cb.setChecked(True)
         opt_layout.addWidget(self.show_browser_cb)
         right_layout.addLayout(opt_layout)
-
         path_layout = QHBoxLayout()
         self.path_display = QLabel("未选择输出路径")
         self.path_display.setStyleSheet("color: gray; border: 1px solid #ccc; padding: 5px;")
@@ -131,7 +119,6 @@ class MainWindow(QMainWindow):
         path_layout.addWidget(self.path_display, 1)
         path_layout.addWidget(self.select_path_btn)
         right_layout.addLayout(path_layout)
-
         action_layout = QHBoxLayout()
         self.category_display = QLabel("请选择二级类目")
         count_layout = QHBoxLayout()
@@ -141,19 +128,16 @@ class MainWindow(QMainWindow):
         self.count_spin.setFixedWidth(80)
         count_layout.addWidget(QLabel("抓取数量:"))
         count_layout.addWidget(self.count_spin)
-        
         self.search_btn = QPushButton("开始抓取")
         self.search_btn.setEnabled(False)
         self.search_btn.clicked.connect(self.start_scraping)
         self.search_btn.setMinimumHeight(50)
         self.search_btn.setStyleSheet("QPushButton { background-color: #0078d4; color: white; font-weight: bold; font-size: 16px; border-radius: 4px; } QPushButton:disabled { background-color: #cccccc; color: #888888; }")
-        
         action_layout.addWidget(QLabel("<font color='red'>*</font>检索类目:"))
         action_layout.addWidget(self.category_display, 1)
         action_layout.addLayout(count_layout)
         action_layout.addWidget(self.search_btn)
         right_layout.addLayout(action_layout)
-
         self.pbar = QProgressBar()
         self.log_output = QTextEdit()
         self.log_output.setReadOnly(True)
@@ -162,7 +146,6 @@ class MainWindow(QMainWindow):
         right_layout.addWidget(self.pbar)
         self.status_label = QLabel("就绪")
         right_layout.addWidget(self.status_label)
-
         splitter = QSplitter(Qt.Orientation.Horizontal)
         splitter.addWidget(left_widget)
         splitter.addWidget(right_widget)

+ 41 - 61
src/scraper.py

@@ -1,4 +1,3 @@
-# 【版本:20260115-终极订正版】
 # 针对 Python 3.12+ 移除 distutils 的兼容性补丁
 import sys
 try:
@@ -34,7 +33,6 @@ class Scraper1688:
             stealth(self.driver, languages=["zh-CN", "zh"], vendor="Google Inc.", platform="Win32", fix_hairline=True)
 
     def _find_chrome(self):
-        """ 强力锁定 Chrome 安装路径 """
         import winreg
         reg_paths = [
             (winreg.HKEY_LOCAL_MACHINE, r"SOFTWARE\Microsoft\Windows\CurrentVersion\App Paths\chrome.exe"),
@@ -75,21 +73,16 @@ class Scraper1688:
             return opts
         try:
             self.driver = uc.Chrome(options=create_options(), headless=headless, browser_executable_path=chrome_path, use_subprocess=True)
-        except Exception as e:
+        except:
             self.driver = uc.Chrome(options=create_options(), headless=headless, use_subprocess=True)
 
     def clean_url(self, url):
-        """ 【关键订正】极其简化的 ID 提取逻辑,只要是商品就必须进入详情页 """
         if not url: return ""
-        # 强制转换为字符串并处理
         url_str = str(url)
         if url_str.startswith("//"): url_str = "https:" + url_str
-        
-        # 只要能匹配到连续的 9-15 位数字(1688 商品 ID 特征),就重组
         id_match = re.search(r'(\d{9,15})', url_str)
         if id_match:
-            standard_url = f"https://detail.1688.com/offer/{id_match.group(1)}.html"
-            return standard_url
+            return f"https://detail.1688.com/offer/{id_match.group(1)}.html"
         return ""
 
     def check_for_captcha(self):
@@ -112,7 +105,6 @@ class Scraper1688:
         base_url = f"https://s.1688.com/selloffer/offer_search.htm?keywords={gbk_keyword}&n=y&netType=1%2C11%2C16"
         self.driver.get("https://www.1688.com")
         self.check_for_captcha()
-        
         all_links = existing_links if existing_links is not None else set()
         page, initial_count = 1, len(all_links)
         
@@ -120,47 +112,38 @@ class Scraper1688:
             print(f"[*] 正在处理列表页: 第 {page} 页...")
             self.driver.get(f"{base_url}&beginPage={page}&page={page}")
             self.check_for_captcha()
-            for i in range(1, 5):
-                self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/4});")
-                time.sleep(1.2)
+            for i in range(1, 9):
+                self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/8});")
+                time.sleep(random.uniform(1.2, 2.5))
+                if i == 4:
+                    self.driver.execute_script("window.scrollBy(0, -400);")
+                    time.sleep(1.0)
+            time.sleep(5)
 
-            # 获取本页链接 (完全对标 req.py 变量探测)
             page_results = self._extract_all_methods()
-            print(f"  [+] 本页发现 {len(page_results)} 个原始条目")
+            print(f"  [+] 本页发现 {len(page_results)} 个商品原始条目")
             
             page_batch = []
             for it in page_results:
-                raw_link = it.get("link")
-                clean_url = self.clean_url(raw_link)
-                
-                if not clean_url:
-                    continue
-                
-                if clean_url in all_links:
-                    print(f"  [-] 跳过已存在商品: {clean_url}")
-                    continue
-
-                all_links.add(clean_url)
-                # 【强制日志】只要进入这里,就一定会打印并执行详情抓取
-                print(f"  [>] 正在执行详情抓取流程: {clean_url}")
-                
-                detail_results = self.scrape_detail(clean_url)
-                if detail_results:
-                    page_batch.extend(detail_results)
-                else:
-                    # 即使详情失败也记录基本信息,防止死循环
-                    page_batch.append({
-                        "category": "", "brand": "", "name": it.get("name", "未知"),
-                        "color": "", "spec": "", "material": "", "price": "",
-                        "moq": "", "wholesale_price": "", "link": clean_url, "supplier": ""
-                    })
-                
-                if len(page_batch) >= 10:
-                    yield page_batch
-                    page_batch = []
-                
-                time.sleep(random.uniform(15, 25)) 
-                if len(all_links) >= total_count + initial_count: break
+                clean_url = self.clean_url(it.get("link"))
+                if clean_url and clean_url not in all_links:
+                    all_links.add(clean_url)
+                    print(f"  [>] 正在启动详情抓取: {clean_url}")
+                    detail_results = self.scrape_detail(clean_url)
+                    if detail_results:
+                        page_batch.extend(detail_results)
+                    else:
+                        page_batch.append({
+                            "category": "", "brand": "", "name": it.get("name", "未知"),
+                            "color": "", "spec": "", "material": "", "price": "",
+                            "moq": "", "wholesale_price": "", "link": clean_url, "supplier": ""
+                        })
+                    
+                    if len(page_batch) >= 10:
+                        yield page_batch
+                        page_batch = []
+                    time.sleep(random.uniform(15, 25)) 
+                    if len(all_links) >= total_count + initial_count: break
             
             if page_batch: yield page_batch
             page += 1
@@ -170,17 +153,13 @@ class Scraper1688:
         return list(all_links)
 
     def scrape_detail(self, url):
-        """ 精准解析:完全同步自 req.py 的模型获取逻辑 """
+        """ 深度解析详情页,支持款式和逐条价格获取 """
         try:
             self.driver.get(url)
-            time.sleep(random.uniform(5, 8))
+            time.sleep(random.uniform(5, 10))
             self.check_for_captcha()
-            # 执行 JS 获取核心模型 (完全对标 req.py)
             model = self.driver.execute_script(
-                "return (window.context && window.context.result && "
-                "window.context.result.global && window.context.result.global.globalData "
-                "&& window.context.result.global.globalData.model) || "
-                "window.__INITIAL_DATA__ || window.iDetailData || window.iDetailConfig || null;"
+                "return (window.context && window.context.result && window.context.result.global && window.context.result.global.globalData && window.context.result.global.globalData.model) || window.__INITIAL_DATA__ || window.iDetailData || window.iDetailConfig || null;"
             )
             if not model: return None
 
@@ -196,7 +175,8 @@ class Scraper1688:
                 return ""
 
             trade = model.get("tradeModel", {}) if isinstance(model, dict) else {}
-            range_text = " / ".join([f"{r.get('beginAmount')}起 ¥{r.get('price') or r.get('discountPrice')}" for r in (trade.get("disPriceRanges") or trade.get("currentPrices") or [])])
+            ranges = trade.get("disPriceRanges") or trade.get("currentPrices") or []
+            range_text = " / ".join([f"{r.get('beginAmount')}起 ¥{r.get('price') or r.get('discountPrice')}" for r in ranges])
 
             base_data = {
                 "category": (model.get("offerDetail", {}).get("leafCategoryName", "") if isinstance(model, dict) else "") or self.driver.find_element(By.CSS_SELECTOR, "div[class*=breadcrumb] a:last-child").text.strip(),
@@ -210,7 +190,6 @@ class Scraper1688:
 
             variant_data_list = []
             try:
-                # 方案 A: 优先使用 expand-view-list-wrapper 获取款式和价格
                 wrappers = self.driver.find_elements(By.CLASS_NAME, "expand-view-list-wrapper")
                 if wrappers:
                     items = wrappers[0].find_elements(By.CSS_SELECTOR, ".expand-view-list-item, [class*='list-item'], .sku-item")
@@ -218,22 +197,24 @@ class Scraper1688:
                         try:
                             label = item_el.find_element(By.CLASS_NAME, "item-label").text.strip()
                             price = item_el.find_element(By.CLASS_NAME, "item-price-stock").text.strip()
-                            if label: variant_data_list.append({"label": label, "price": re.sub(r'[^\d.]', '', price)})
+                            if label:
+                                variant_data_list.append({"label": label, "price": re.sub(r'[^\d.]', '', price)})
                         except: continue
             except: pass
 
             if variant_data_list:
                 results = []
                 for vd in variant_data_list:
-                    row = base_data.copy(); row["color"] = vd["label"]; row["price"] = vd["price"]; results.append(row)
+                    row = base_data.copy()
+                    row["color"] = vd["label"]
+                    row["price"] = vd["price"]
+                    results.append(row)
                 return results
             return [base_data]
         except: return None
 
     def _extract_all_methods(self):
-        """ 强化版:全力探测 1688 列表页数据 (对标 req.py) """
         results = []
-        # 1. 深度内存变量扫描
         scripts = [
             "return JSON.stringify(window.data || window.context?.result?.data || window.__INITIAL_DATA__)",
             "return JSON.stringify(window.context?.result?.global?.globalData?.data || null)"
@@ -255,8 +236,7 @@ class Scraper1688:
                         if link: results.append({"name": str(o.get('title', o.get('subject', ''))), "link": link})
                     if results: return results
             except: continue
-        # 2. 暴力 DOM 选择器保底
-        for s in [".sm-offer-item", ".offer-card-item", ".pc-search-offer-item", "[class*='offer-card']", ".offer-item"]:
+        for s in [".sm-offer-item", ".offer-card-item", "[class*='offer-card']", ".offer-item"]:
             for el in self.driver.find_elements(By.CSS_SELECTOR, s):
                 try:
                     a = el.find_element(By.TAG_NAME, "a")