|
|
@@ -116,8 +116,8 @@ class Scraper1688:
|
|
|
if url.startswith("//"): url = "https:" + url
|
|
|
|
|
|
# 1. 尝试从各种路径模式中提取纯数字商品 ID
|
|
|
- id_match = re.search(r'offer/(\d+)\.html', url) or \
|
|
|
- re.search(r'[?&](?:offerId|id)=(\d+)', url) or \
|
|
|
+ id_match = re.search(r'offer(?:Id|Ids)?/(\d+)\.html', url) or \
|
|
|
+ re.search(r'[?&](?:offerId|offerIds|id)=(\d+)', url) or \
|
|
|
re.search(r'object_id@(\d+)', url)
|
|
|
|
|
|
if id_match:
|
|
|
@@ -163,11 +163,14 @@ class Scraper1688:
|
|
|
print(f"[*] 正在处理列表页: 第 {page} 页...")
|
|
|
self.driver.get(f"{base_url}&beginPage={page}&page={page}")
|
|
|
self.check_for_captcha()
|
|
|
- for i in range(1, 5):
|
|
|
- self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/4});")
|
|
|
+            # Enhanced scrolling: more steps so lazily-loaded items render
|
|
|
+ for i in range(1, 6):
|
|
|
+ self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/5});")
|
|
|
time.sleep(1.5)
|
|
|
|
|
|
page_results = self._extract_all_methods()
|
|
|
+ print(f" [+] 本页发现 {len(page_results)} 个商品链接")
|
|
|
+
|
|
|
page_batch = []
|
|
|
for it in page_results:
|
|
|
clean_url = self.clean_url(it["link"])
|
|
|
@@ -198,7 +201,7 @@ class Scraper1688:
|
|
|
return list(all_links)
|
|
|
|
|
|
def scrape_detail(self, url):
|
|
|
- """ 精准拆分款式与价格 """
|
|
|
+ """ 极其精准的详情页解析:获取 expand-view-list-wrapper 中的款式描述 + 逐条价格 """
|
|
|
try:
|
|
|
self.driver.get(url)
|
|
|
time.sleep(random.uniform(5, 10))
|
|
|
@@ -238,14 +241,21 @@ class Scraper1688:
|
|
|
|
|
|
variant_data_list = []
|
|
|
try:
|
|
|
+            # 1. Core requirement: extract label text and prices from expand-view-list-wrapper
|
|
|
wrappers = self.driver.find_elements(By.CLASS_NAME, "expand-view-list-wrapper")
|
|
|
if wrappers:
|
|
|
+                # Locate the entries under the container
|
|
|
items = wrappers[0].find_elements(By.CSS_SELECTOR, ".expand-view-list-item, [class*='list-item'], .sku-item")
|
|
|
for item_el in items:
|
|
|
try:
|
|
|
+                        # Variant description text (item-label)
|
|
|
label = item_el.find_element(By.CLASS_NAME, "item-label").text.strip()
|
|
|
- price = item_el.find_element(By.CLASS_NAME, "item-price-stock").text.strip()
|
|
|
- if label: variant_data_list.append({"label": label, "price": re.sub(r'[^\d.]', '', price)})
|
|
|
+                        # Per-entry price (item-price-stock)
|
|
|
+ price_raw = item_el.find_element(By.CLASS_NAME, "item-price-stock").text.strip()
|
|
|
+                        # Clean the price: keep digits and decimal point only
|
|
|
+ price_clean = re.sub(r'[^\d.]', '', price_raw)
|
|
|
+ if label:
|
|
|
+ variant_data_list.append({"label": label, "price": price_clean})
|
|
|
except: continue
|
|
|
except: pass
|
|
|
|
|
|
@@ -257,12 +267,28 @@ class Scraper1688:
|
|
|
row["price"] = vd["price"]
|
|
|
results.append(row)
|
|
|
return results
|
|
|
+
|
|
|
+            # 2. Plan B: fall back to the JS data model if DOM probing fails
|
|
|
+ sku_props = model.get("skuModel", {}).get("skuProps", []) or model.get("detailData", {}).get("skuProps", []) or []
|
|
|
+ main_prop = next((p for p in sku_props if any(k in p.get("prop", "") for k in ["颜色", "分类", "款式", "花色", "净含量"])), None)
|
|
|
+ if not main_prop and sku_props: main_prop = sku_props[0]
|
|
|
+ if main_prop and main_prop.get("value"):
|
|
|
+ results = []
|
|
|
+ for val in main_prop["value"]:
|
|
|
+ if val.get("name"):
|
|
|
+ row = base_data.copy()
|
|
|
+ row["color"] = val.get("name")
|
|
|
+ row["price"] = trade.get("minPrice", "")
|
|
|
+ results.append(row)
|
|
|
+ return results
|
|
|
+
|
|
|
return [base_data]
|
|
|
except: return None
|
|
|
|
|
|
def _extract_all_methods(self):
|
|
|
- """ 列表页提取 """
|
|
|
+ """ 强化版列表页提取:支持最新 1688 选择器 """
|
|
|
results = []
|
|
|
+        # 1. JSON extraction
|
|
|
try:
|
|
|
res = self.driver.execute_script("return JSON.stringify(window.data || window.__INITIAL_DATA__)")
|
|
|
if res:
|
|
|
@@ -276,17 +302,17 @@ class Scraper1688:
|
|
|
return None
|
|
|
for o in (find_list(data) or []):
|
|
|
link = o.get('itemUrl', o.get('url', ''))
|
|
|
- # 过滤干扰链接
|
|
|
if link and "similar_search" not in link:
|
|
|
results.append({"name": str(o.get('title', '')), "link": link})
|
|
|
except: pass
|
|
|
+        # 2. Aggressive DOM scan
|
|
|
if not results:
|
|
|
- for s in [".search-offer-item", "[class*='offer-card']", ".offer-item"]:
|
|
|
+ selectors = [".sm-offer-item", ".offer-card-item", ".search-offer-item", "[class*='offer-card']", ".offer-item"]
|
|
|
+ for s in selectors:
|
|
|
for el in self.driver.find_elements(By.CSS_SELECTOR, s):
|
|
|
try:
|
|
|
link = el.find_element(By.TAG_NAME, "a").get_attribute("href")
|
|
|
- # 只有包含详情特征的链接才提取
|
|
|
- if link and ("offer" in link or "item" in link) and "similar_search" not in link:
|
|
|
+ if link and "similar_search" not in link:
|
|
|
results.append({"name": el.text.split('\n')[0][:50], "link": link})
|
|
|
except: continue
|
|
|
if results: break
|