|
|
@@ -103,15 +103,15 @@ class Scraper1688:
|
|
|
try:
|
|
|
self._cleanup()
|
|
|
time.sleep(2)
|
|
|
- # 兜底方案
|
|
|
- self.driver = uc.Chrome(options=create_options(), headless=headless, use_subprocess=True)
|
|
|
+ # 兜底方案:不使用 subprocess
|
|
|
+ self.driver = uc.Chrome(options=create_options(), headless=headless)
|
|
|
print("[+] 自动兼容模式启动成功!")
|
|
|
except Exception as e2:
|
|
|
print(f"[致命错误] 无法启动 Chrome: {e2}")
|
|
|
raise Exception("无法拉起 Chrome,请尝试关闭杀毒软件或重新安装 Chrome。")
|
|
|
|
|
|
def clean_url(self, url):
|
|
|
- """ 【核心订正】极其鲁棒的 ID 提取逻辑,强制转化为详情页链接,过滤店铺页 """
|
|
|
+ """ 极其鲁棒的 ID 提取逻辑,强制转化为详情页链接,过滤店铺页 """
|
|
|
if not url: return ""
|
|
|
if url.startswith("//"): url = "https:" + url
|
|
|
|
|
|
@@ -163,9 +163,8 @@ class Scraper1688:
|
|
|
print(f"[*] 正在处理列表页: 第 {page} 页...")
|
|
|
self.driver.get(f"{base_url}&beginPage={page}&page={page}")
|
|
|
self.check_for_captcha()
|
|
|
- # 增强滚动
|
|
|
- for i in range(1, 6):
|
|
|
- self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/5});")
|
|
|
+ for i in range(1, 5):
|
|
|
+ self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/4});")
|
|
|
time.sleep(1.5)
|
|
|
|
|
|
page_results = self._extract_all_methods()
|
|
|
@@ -222,15 +221,19 @@ class Scraper1688:
|
|
|
except: pass
|
|
|
return ""
|
|
|
|
|
|
+ def safe_text(by, sel):  # Best-effort lookup: stripped text of the first element matching (by, sel), or "" if the lookup raises.
|
|
|
+ try: return self.driver.find_element(by, sel).text.strip()
|
|
|
+ except Exception: return ""  # Exception rather than a bare except, so KeyboardInterrupt/SystemExit still propagate.
|
|
|
+
|
|
|
trade = model.get("tradeModel", {}) if isinstance(model, dict) else {}
|
|
|
ranges = trade.get("disPriceRanges") or trade.get("currentPrices") or []
|
|
|
range_text = " / ".join([f"{r.get('beginAmount')}起 ¥{r.get('price') or r.get('discountPrice')}" for r in ranges])
|
|
|
|
|
|
base_data = {
|
|
|
- "category": (model.get("offerDetail", {}).get("leafCategoryName", "") if isinstance(model, dict) else "") or self.driver.find_element(By.CSS_SELECTOR, "div[class*=breadcrumb] a:last-child").text.strip(),
|
|
|
+ "category": (model.get("offerDetail", {}).get("leafCategoryName", "") if isinstance(model, dict) else "") or safe_text(By.CSS_SELECTOR, "div[class*=breadcrumb] a:last-child"),
|
|
|
"brand": get_attr("品牌"),
|
|
|
"name": (model.get("offerDetail", {}).get("subject", "") if isinstance(model, dict) else "") or self.driver.title.split('-')[0],
|
|
|
- "spec": get_attr("尺码") or get_attr("规格") or get_attr("型号"),
|
|
|
+ "spec": get_attr("尺码") or get_attr("规格") or get_attr("型号") or safe_text(By.XPATH, "//div[@id='productAttributes']//th[span='尺码' or span='规格']/following-sibling::td[1]//span[@class='field-value']"),
|
|
|
"material": get_attr("材质") or get_attr("面料"),
|
|
|
"price": "",
|
|
|
"moq": trade.get("beginAmount", ""),
|
|
|
@@ -282,13 +285,13 @@ class Scraper1688:
|
|
|
results.append(row)
|
|
|
return results
|
|
|
|
|
|
+ base_data["price"] = trade.get("minPrice", "")
|
|
|
return [base_data]
|
|
|
except: return None
|
|
|
|
|
|
def _extract_all_methods(self):
|
|
|
- """ 强化版列表页提取:支持最新 1688 选择器 """
|
|
|
+ """ 列表页提取 """
|
|
|
results = []
|
|
|
- # 1. JSON 提取
|
|
|
try:
|
|
|
res = self.driver.execute_script("return JSON.stringify(window.data || window.__INITIAL_DATA__)")
|
|
|
if res:
|
|
|
@@ -305,14 +308,13 @@ class Scraper1688:
|
|
|
if link and "similar_search" not in link:
|
|
|
results.append({"name": str(o.get('title', '')), "link": link})
|
|
|
except: pass
|
|
|
- # 2. 强力 DOM 扫描
|
|
|
if not results:
|
|
|
- selectors = [".sm-offer-item", ".offer-card-item", ".search-offer-item", "[class*='offer-card']", ".offer-item"]
|
|
|
- for s in selectors:
|
|
|
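+ # DOM fallback: the selector list is unchanged; the new part is the link filter below, which keeps only hrefs containing "offer" or "item".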
|
|
|
+ for s in [".sm-offer-item", ".offer-card-item", ".search-offer-item", "[class*='offer-card']", ".offer-item"]:
|
|
|
for el in self.driver.find_elements(By.CSS_SELECTOR, s):
|
|
|
try:
|
|
|
link = el.find_element(By.TAG_NAME, "a").get_attribute("href")
|
|
|
- if link and "similar_search" not in link:
|
|
|
+ if link and ("offer" in link or "item" in link) and "similar_search" not in link:
|
|
|
results.append({"name": el.text.split('\n')[0][:50], "link": link})
|
|
|
except: continue
|
|
|
if results: break
|