LuTong hace 2 meses
padre
commit
b86a8ba0e7
Se ha modificado 1 fichero con 75 adiciones y 32 borrados
  1. 75 32
      src/scraper.py

+ 75 - 32
src/scraper.py

@@ -125,17 +125,20 @@ class Scraper1688:
             self.driver.get(f"{base_url}&beginPage={page}&page={page}")
             self.check_for_captcha()
             
-            # --- 强化:分段滚动激活懒加载 ---
-            for i in range(1, 11):
-                self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/10});")
-                time.sleep(random.uniform(1.5, 3.0))
-                if i == 5:
-                    self.driver.execute_script("window.scrollBy(0, -300);")
+            # --- 强化:模拟真实人类分段滚动,触发懒加载 ---
+            for i in range(1, 13):
+                # 分段滑动
+                self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/12});")
+                time.sleep(random.uniform(1.2, 2.8))
+                # 关键:每隔几步向上“回弹”一下,这种非规律动作最能触发 1688 的加载钩子
+                if i % 4 == 0:
+                    self.driver.execute_script(f"window.scrollBy(0, -{random.randint(200, 500)});")
                     time.sleep(1.0)
-            time.sleep(3)
+            
+            time.sleep(random.uniform(3, 6)) # 最终等待数据同步到变量
 
             page_results = self._extract_all_methods()
-            print(f"  [+] 本页解析完成:共发现 {len(page_results)} 个商品链接")
+            print(f"  [+] 本页解析完成:共发现 {len(page_results)} 个潜在商品链接")
             
             page_batch = []
             for it in page_results:
@@ -256,32 +259,72 @@ class Scraper1688:
         except: return None
 
     def _extract_all_methods(self):
-        """ 列表页提取 """
-        results = []
-        try:
-            res = self.driver.execute_script("return JSON.stringify(window.data || window.context?.result?.data || window.__INITIAL_DATA__)")
-            if res and res != "null":
-                data = json.loads(res)
-                def find_list(obj):
-                    if isinstance(obj, list) and len(obj) > 0 and isinstance(obj[0], dict) and any(k in obj[0] for k in ['offerId', 'title']): return obj
-                    if isinstance(obj, dict):
-                        for k in obj:
-                            f = find_list(obj[k])
-                            if f: return f
-                    return None
-                for o in (find_list(data) or []):
-                    link = o.get('itemUrl', o.get('url', '')) or str(o.get('offerId', ''))
-                    if link: results.append({"name": str(o.get('title', '')), "link": link})
-        except: pass
-        if not results:
-            for s in [".sm-offer-item", ".offer-card-item", "[class*='offer-card']", ".offer-item"]:
-                for el in self.driver.find_elements(By.CSS_SELECTOR, s):
+        """ 强化版:从所有可能的内存变量中收集商品列表,并去重 """
+        all_items = []
+        seen_ids = set()
+
+        def add_item(name, link):
+            if not link: return
+            # 统一转化为标准详情页链接并提取 ID 作为去重键
+            url_str = str(link)
+            id_match = re.search(r'(\d{9,15})', url_str)
+            if id_match:
+                oid = id_match.group(1)
+                if oid not in seen_ids:
+                    seen_ids.add(oid)
+                    standard_url = f"https://detail.1688.com/offer/{oid}.html"
+                    all_items.append({"name": name, "link": standard_url})
+
+        # 1. 深度内存探测 (对标 req.py 逻辑)
+        scripts = [
+            "return JSON.stringify(window.data)",
+            "return JSON.stringify(window.context?.result?.data)",
+            "return JSON.stringify(window.__INITIAL_DATA__)",
+            "return JSON.stringify(window.pageData)"
+        ]
+        
+        for s in scripts:
+            try:
+                res = self.driver.execute_script(s)
+                if res and res != "null":
+                    data = json.loads(res)
+                    
+                    # 递归寻找所有符合商品列表特征的 list
+                    def collect_lists(obj):
+                        found = []
+                        if isinstance(obj, list) and len(obj) > 0:
+                            # 只要列表第一项包含 offerId 或 title,就认为是目标列表
+                            if isinstance(obj[0], dict) and any(k in obj[0] for k in ['offerId', 'title', 'subject']):
+                                found.append(obj)
+                        elif isinstance(obj, dict):
+                            for v in obj.values():
+                                found.extend(collect_lists(v))
+                        return found
+
+                    all_found_lists = collect_lists(data)
+                    for plist in all_found_lists:
+                        for o in plist:
+                            name = str(o.get('title', o.get('subject', o.get('name', ''))))
+                            link = o.get('itemUrl', o.get('url', '')) or str(o.get('offerId', ''))
+                            add_item(name, link)
+            except: continue
+
+        # 2. DOM 暴力补位 (如果内存变量探测到的不足 10 个,说明可能渲染机制变了)
+        if len(all_items) < 10:
+            selectors = [".sm-offer-item", ".offer-card-item", "[class*='offer-card']", ".offer-item", ".major-offer"]
+            for s in selectors:
+                elements = self.driver.find_elements(By.CSS_SELECTOR, s)
+                for el in elements:
                     try:
-                        link = el.find_element(By.TAG_NAME, "a").get_attribute("href")
-                        if link: results.append({"name": el.text.split('\n')[0][:50], "link": link})
+                        # 尝试寻找 a 标签
+                        a_tags = el.find_elements(By.TAG_NAME, "a")
+                        for a in a_tags:
+                            link = a.get_attribute("href")
+                            name = el.text.split('\n')[0][:50]
+                            add_item(name, link)
                     except: continue
-                if results: break
-        return results
+        
+        return all_items
 
     def quit(self):
         try: self.driver.quit()