LuTong 2 maanden geleden
bovenliggende
commit
15eab1f1b7
1 gewijzigd bestand met toevoegingen van 38 en 12 verwijderingen
  1. 38 12
      src/scraper.py

+ 38 - 12
src/scraper.py

@@ -116,8 +116,8 @@ class Scraper1688:
         if url.startswith("//"): url = "https:" + url
         
         # 1. 尝试从各种路径模式中提取纯数字商品 ID
-        id_match = re.search(r'offer/(\d+)\.html', url) or \
-                   re.search(r'[?&](?:offerId|id)=(\d+)', url) or \
+        id_match = re.search(r'offer(?:Id|Ids)?/(\d+)\.html', url) or \
+                   re.search(r'[?&](?:offerId|offerIds|id)=(\d+)', url) or \
                    re.search(r'object_id@(\d+)', url)
         
         if id_match:
@@ -163,11 +163,14 @@ class Scraper1688:
             print(f"[*] 正在处理列表页: 第 {page} 页...")
             self.driver.get(f"{base_url}&beginPage={page}&page={page}")
             self.check_for_captcha()
-            for i in range(1, 5):
-                self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/4});")
+            # 增强滚动
+            for i in range(1, 6):
+                self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/5});")
                 time.sleep(1.5)
 
             page_results = self._extract_all_methods()
+            print(f"  [+] 本页发现 {len(page_results)} 个商品链接")
+            
             page_batch = []
             for it in page_results:
                 clean_url = self.clean_url(it["link"])
@@ -198,7 +201,7 @@ class Scraper1688:
         return list(all_links)
 
     def scrape_detail(self, url):
-        """ 精准拆分款式与价格 """
+        """ 极其精准的详情页解析:获取 expand-view-list-wrapper 中的款式描述 + 逐条价格 """
         try:
             self.driver.get(url)
             time.sleep(random.uniform(5, 10))
@@ -238,14 +241,21 @@ class Scraper1688:
 
             variant_data_list = []
             try:
+                # 1. 核心需求:从 expand-view-list-wrapper 中提取文字和价格
                 wrappers = self.driver.find_elements(By.CLASS_NAME, "expand-view-list-wrapper")
                 if wrappers:
+                    # 寻找容器下的条目
                     items = wrappers[0].find_elements(By.CSS_SELECTOR, ".expand-view-list-item, [class*='list-item'], .sku-item")
                     for item_el in items:
                         try:
+                            # 款式描述文字 (item-label)
                             label = item_el.find_element(By.CLASS_NAME, "item-label").text.strip()
-                            price = item_el.find_element(By.CLASS_NAME, "item-price-stock").text.strip()
-                            if label: variant_data_list.append({"label": label, "price": re.sub(r'[^\d.]', '', price)})
+                            # 逐条价格 (item-price-stock)
+                            price_raw = item_el.find_element(By.CLASS_NAME, "item-price-stock").text.strip()
+                            # 清洗价格,只保留数字
+                            price_clean = re.sub(r'[^\d.]', '', price_raw)
+                            if label:
+                                variant_data_list.append({"label": label, "price": price_clean})
                         except: continue
             except: pass
 
@@ -257,12 +267,28 @@ class Scraper1688:
                     row["price"] = vd["price"]
                     results.append(row)
                 return results
+            
+            # 2. 方案 B: 如果 DOM 探测失败,回退到 JS 模型
+            sku_props = model.get("skuModel", {}).get("skuProps", []) or model.get("detailData", {}).get("skuProps", []) or []
+            main_prop = next((p for p in sku_props if any(k in p.get("prop", "") for k in ["颜色", "分类", "款式", "花色", "净含量"])), None)
+            if not main_prop and sku_props: main_prop = sku_props[0]
+            if main_prop and main_prop.get("value"):
+                results = []
+                for val in main_prop["value"]:
+                    if val.get("name"):
+                        row = base_data.copy()
+                        row["color"] = val.get("name")
+                        row["price"] = trade.get("minPrice", "")
+                        results.append(row)
+                return results
+            
             return [base_data]
         except: return None
 
     def _extract_all_methods(self):
-        """ 列表页提取 """
+        """ 强化版列表页提取:支持最新 1688 选择器 """
         results = []
+        # 1. JSON 提取
         try:
             res = self.driver.execute_script("return JSON.stringify(window.data || window.__INITIAL_DATA__)")
             if res:
@@ -276,17 +302,17 @@ class Scraper1688:
                     return None
                 for o in (find_list(data) or []):
                     link = o.get('itemUrl', o.get('url', ''))
-                    # 过滤干扰链接
                     if link and "similar_search" not in link:
                         results.append({"name": str(o.get('title', '')), "link": link})
         except: pass
+        # 2. 强力 DOM 扫描
         if not results:
-            for s in [".search-offer-item", "[class*='offer-card']", ".offer-item"]:
+            selectors = [".sm-offer-item", ".offer-card-item", ".search-offer-item", "[class*='offer-card']", ".offer-item"]
+            for s in selectors:
                 for el in self.driver.find_elements(By.CSS_SELECTOR, s):
                     try:
                         link = el.find_element(By.TAG_NAME, "a").get_attribute("href")
-                        # 只有包含详情特征的链接才提取
-                        if link and ("offer" in link or "item" in link) and "similar_search" not in link:
+                        if link and "similar_search" not in link:
                             results.append({"name": el.text.split('\n')[0][:50], "link": link})
                     except: continue
                 if results: break