|
|
@@ -125,17 +125,20 @@ class Scraper1688:
|
|
|
self.driver.get(f"{base_url}&beginPage={page}&page={page}")
|
|
|
self.check_for_captcha()
|
|
|
|
|
|
- # --- 强化:分段滚动激活懒加载 ---
|
|
|
- for i in range(1, 11):
|
|
|
- self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/10});")
|
|
|
- time.sleep(random.uniform(1.5, 3.0))
|
|
|
- if i == 5:
|
|
|
- self.driver.execute_script("window.scrollBy(0, -300);")
|
|
|
+ # --- 强化:模拟真实人类分段滚动,触发懒加载 ---
|
|
|
+ for i in range(1, 13):
|
|
|
+ # 分段滑动
|
|
|
+ self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/12});")
|
|
|
+ time.sleep(random.uniform(1.2, 2.8))
|
|
|
+ # 关键:每隔几步向上“回弹”一下,这种非规律动作最能触发 1688 的加载钩子
|
|
|
+ if i % 4 == 0:
|
|
|
+ self.driver.execute_script(f"window.scrollBy(0, -{random.randint(200, 500)});")
|
|
|
time.sleep(1.0)
|
|
|
- time.sleep(3)
|
|
|
+
|
|
|
+ time.sleep(random.uniform(3, 6)) # 最终等待数据同步到变量
|
|
|
|
|
|
page_results = self._extract_all_methods()
|
|
|
- print(f" [+] 本页解析完成:共发现 {len(page_results)} 个商品链接")
|
|
|
+ print(f" [+] 本页解析完成:共发现 {len(page_results)} 个潜在商品链接")
|
|
|
|
|
|
page_batch = []
|
|
|
for it in page_results:
|
|
|
@@ -256,32 +259,72 @@ class Scraper1688:
|
|
|
except: return None
|
|
|
|
|
|
def _extract_all_methods(self):
|
|
|
- """ 列表页提取 """
|
|
|
- results = []
|
|
|
- try:
|
|
|
- res = self.driver.execute_script("return JSON.stringify(window.data || window.context?.result?.data || window.__INITIAL_DATA__)")
|
|
|
- if res and res != "null":
|
|
|
- data = json.loads(res)
|
|
|
- def find_list(obj):
|
|
|
- if isinstance(obj, list) and len(obj) > 0 and isinstance(obj[0], dict) and any(k in obj[0] for k in ['offerId', 'title']): return obj
|
|
|
- if isinstance(obj, dict):
|
|
|
- for k in obj:
|
|
|
- f = find_list(obj[k])
|
|
|
- if f: return f
|
|
|
- return None
|
|
|
- for o in (find_list(data) or []):
|
|
|
- link = o.get('itemUrl', o.get('url', '')) or str(o.get('offerId', ''))
|
|
|
- if link: results.append({"name": str(o.get('title', '')), "link": link})
|
|
|
- except: pass
|
|
|
- if not results:
|
|
|
- for s in [".sm-offer-item", ".offer-card-item", "[class*='offer-card']", ".offer-item"]:
|
|
|
- for el in self.driver.find_elements(By.CSS_SELECTOR, s):
|
|
|
+ """ 强化版:从所有可能的内存变量中收集商品列表,并去重 """
|
|
|
+ all_items = []
|
|
|
+ seen_ids = set()
|
|
|
+
|
|
|
+ def add_item(name, link):
|
|
|
+ if not link: return
|
|
|
+ # 统一转化为标准详情页链接并提取 ID 作为去重键
|
|
|
+ url_str = str(link)
|
|
|
+ id_match = re.search(r'(\d{9,15})', url_str)
|
|
|
+ if id_match:
|
|
|
+ oid = id_match.group(1)
|
|
|
+ if oid not in seen_ids:
|
|
|
+ seen_ids.add(oid)
|
|
|
+ standard_url = f"https://detail.1688.com/offer/{oid}.html"
|
|
|
+ all_items.append({"name": name, "link": standard_url})
|
|
|
+
|
|
|
+ # 1. 深度内存探测 (对标 req.py 逻辑)
|
|
|
+ scripts = [
|
|
|
+ "return JSON.stringify(window.data)",
|
|
|
+ "return JSON.stringify(window.context?.result?.data)",
|
|
|
+ "return JSON.stringify(window.__INITIAL_DATA__)",
|
|
|
+ "return JSON.stringify(window.pageData)"
|
|
|
+ ]
|
|
|
+
|
|
|
+ for s in scripts:
|
|
|
+ try:
|
|
|
+ res = self.driver.execute_script(s)
|
|
|
+ if res and res != "null":
|
|
|
+ data = json.loads(res)
|
|
|
+
|
|
|
+ # 递归寻找所有符合商品列表特征的 list
|
|
|
+ def collect_lists(obj):
|
|
|
+ found = []
|
|
|
+ if isinstance(obj, list) and len(obj) > 0:
|
|
|
+ # 只要列表第一项包含 offerId 或 title,就认为是目标列表
|
|
|
+ if isinstance(obj[0], dict) and any(k in obj[0] for k in ['offerId', 'title', 'subject']):
|
|
|
+ found.append(obj)
|
|
|
+ elif isinstance(obj, dict):
|
|
|
+ for v in obj.values():
|
|
|
+ found.extend(collect_lists(v))
|
|
|
+ return found
|
|
|
+
|
|
|
+ all_found_lists = collect_lists(data)
|
|
|
+ for plist in all_found_lists:
|
|
|
+ for o in plist:
|
|
|
+ name = str(o.get('title', o.get('subject', o.get('name', ''))))
|
|
|
+ link = o.get('itemUrl', o.get('url', '')) or str(o.get('offerId', ''))
|
|
|
+ add_item(name, link)
|
|
|
+ except: continue
|
|
|
+
|
|
|
+ # 2. DOM 暴力补位 (如果内存变量探测到的不足 10 个,说明可能渲染机制变了)
|
|
|
+ if len(all_items) < 10:
|
|
|
+ selectors = [".sm-offer-item", ".offer-card-item", "[class*='offer-card']", ".offer-item", ".major-offer"]
|
|
|
+ for s in selectors:
|
|
|
+ elements = self.driver.find_elements(By.CSS_SELECTOR, s)
|
|
|
+ for el in elements:
|
|
|
try:
|
|
|
- link = el.find_element(By.TAG_NAME, "a").get_attribute("href")
|
|
|
- if link: results.append({"name": el.text.split('\n')[0][:50], "link": link})
|
|
|
+ # 尝试寻找 a 标签
|
|
|
+ a_tags = el.find_elements(By.TAG_NAME, "a")
|
|
|
+ for a in a_tags:
|
|
|
+ link = a.get_attribute("href")
|
|
|
+ name = el.text.split('\n')[0][:50]
|
|
|
+ add_item(name, link)
|
|
|
except: continue
|
|
|
- if results: break
|
|
|
- return results
|
|
|
+
|
|
|
+ return all_items
|
|
|
|
|
|
def quit(self):
|
|
|
try: self.driver.quit()
|