|
|
@@ -1,4 +1,5 @@
|
|
|
-# 【版本:2026-01-16 13:45 - 变体与价格精准同步版】
|
|
|
+# 【版本:2026-01-16 14:00 - 变体精准解析终极版】
|
|
|
+# 针对 Python 3.12+ 移除 distutils 的兼容性补丁
|
|
|
import sys
|
|
|
try:
|
|
|
import distutils
|
|
|
@@ -64,6 +65,7 @@ class Scraper1688:
|
|
|
def _init_chrome(self, headless):
|
|
|
""" 强化版 Chrome 启动逻辑 """
|
|
|
chrome_path = self._find_chrome()
|
|
|
+
|
|
|
def create_options():
|
|
|
opts = uc.ChromeOptions()
|
|
|
opts.add_argument(f'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36')
|
|
|
@@ -75,14 +77,16 @@ class Scraper1688:
|
|
|
opts.add_argument("--disable-dev-shm-usage")
|
|
|
opts.add_argument("--remote-allow-origins=*")
|
|
|
return opts
|
|
|
+
|
|
|
try:
|
|
|
+ # 优先使用 subprocess 模式启动,解决 Win11 连接难题
|
|
|
self.driver = uc.Chrome(options=create_options(), headless=headless, browser_executable_path=chrome_path, use_subprocess=True)
|
|
|
- except Exception:
|
|
|
+ except:
|
|
|
# 失败则尝试普通模式
|
|
|
self.driver = uc.Chrome(options=create_options(), headless=headless, use_subprocess=True)
|
|
|
|
|
|
def clean_url(self, url):
|
|
|
- """ 鲁棒的 ID 提取并重组链接 """
|
|
|
+        """ 鲁棒的 ID 提取逻辑 """
|
|
|
if not url: return ""
|
|
|
url_str = str(url)
|
|
|
if url_str.startswith("//"): url_str = "https:" + url_str
|
|
|
@@ -103,8 +107,7 @@ class Scraper1688:
|
|
|
if self.status_callback: self.status_callback(True, msg)
|
|
|
while is_blocked(): time.sleep(2)
|
|
|
if self.status_callback: self.status_callback(False, "验证通过")
|
|
|
- if self.log_callback: self.log_callback(
|
|
|
- "<font color='orange'>验证成功,进入 120 秒冷却期重置行为指纹...</font>")
|
|
|
+ # 验证成功后强制冷却,防止二次封禁
|
|
|
time.sleep(random.randint(60, 120))
|
|
|
return True
|
|
|
|
|
|
@@ -117,25 +120,24 @@ class Scraper1688:
|
|
|
page, initial_count = 1, len(all_links)
|
|
|
|
|
|
while len(all_links) < total_count + initial_count:
|
|
|
- page_anotation = f"[∫] 列表页采集: 第 {page} 页...";
|
|
|
- print(page_anotation)
|
|
|
- if self.log_callback: self.log_callback(page_anotation)
|
|
|
+ print(f"[*] 列表页采集: 第 {page} 页...")
|
|
|
self.driver.get(f"{base_url}&beginPage={page}&page={page}")
|
|
|
self.check_for_captcha()
|
|
|
|
|
|
- # --- 强化:模拟真实人类分段滚动,触发懒加载 ---
|
|
|
+ # --- 强化:分段滚动激活懒加载,解决第一页只解析到一个的问题 ---
|
|
|
for i in range(1, 13):
|
|
|
# 分段滑动
|
|
|
self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/12});")
|
|
|
- time.sleep(random.uniform(1.2, 2.8))
|
|
|
- # 关键:每隔几步向上“回弹”一下,这种非规律动作最能触发 1688 的加载钩子
|
|
|
+ time.sleep(random.uniform(1.5, 3.0))
|
|
|
+ # 关键:向上“回弹”动作触发 1688 加载钩子
|
|
|
if i % 4 == 0:
|
|
|
self.driver.execute_script(f"window.scrollBy(0, -{random.randint(200, 500)});")
|
|
|
time.sleep(1.0)
|
|
|
- time.sleep(random.uniform(3, 6))
|
|
|
+
|
|
|
+ time.sleep(random.uniform(3, 6)) # 最终等待数据渲染
|
|
|
|
|
|
page_results = self._extract_all_methods()
|
|
|
- print(f" [+] 本页解析完成:共发现 {len(page_results)} 个(潜在)商品链接")
|
|
|
+ print(f" [+] 本页解析完成:共发现 {len(page_results)} 个商品链接")
|
|
|
|
|
|
page_batch = []
|
|
|
for it in page_results:
|
|
|
@@ -143,7 +145,7 @@ class Scraper1688:
|
|
|
if clean_url and clean_url not in all_links:
|
|
|
all_links.add(clean_url)
|
|
|
|
|
|
- # 冷却机制
|
|
|
+ # 保护机制
|
|
|
new_count = len(all_links) - initial_count
|
|
|
if new_count > 0 and new_count % 12 == 0:
|
|
|
rest_secs = random.randint(300, 600)
|
|
|
@@ -161,8 +163,7 @@ class Scraper1688:
|
|
|
yield page_batch
|
|
|
page_batch = []
|
|
|
|
|
|
- # 详情页后的随机等待
|
|
|
- time.sleep(random.uniform(40, 80))
|
|
|
+ time.sleep(random.uniform(40, 80))
|
|
|
if len(all_links) >= total_count + initial_count: break
|
|
|
|
|
|
if page_batch: yield page_batch
|
|
|
@@ -172,7 +173,7 @@ class Scraper1688:
|
|
|
return list(all_links)
|
|
|
|
|
|
def scrape_detail(self, url):
|
|
|
- """ 极精准变体解析:锁定 expand-view-list 并提取款式与逐条价格 """
|
|
|
+ """ 极精准变体解析:锁定 expand-view-list 区域并拆分价格与描述 """
|
|
|
try:
|
|
|
self.driver.get(url)
|
|
|
# 仿真阅读
|
|
|
@@ -202,66 +203,70 @@ class Scraper1688:
|
|
|
"category": (model.get("offerDetail", {}).get("leafCategoryName", "") if isinstance(model, dict) else "") or self.driver.find_element(By.CSS_SELECTOR, "div[class*=breadcrumb] a:last-child").text.strip(),
|
|
|
"brand": get_attr("品牌"),
|
|
|
"name": (model.get("offerDetail", {}).get("subject", "") if isinstance(model, dict) else "") or self.driver.title.split('-')[0],
|
|
|
- "spec": get_attr("尺码") or get_attr("规格") or get_attr("型号"),
|
|
|
+ "spec": "", # 待填充
|
|
|
+ "color": "", # 待填充
|
|
|
"material": get_attr("材质") or get_attr("面料"),
|
|
|
+ "price": "", # 待填充
|
|
|
"moq": trade.get("beginAmount", ""),
|
|
|
"wholesale_price": range_text,
|
|
|
"link": url,
|
|
|
"supplier": (model.get("sellerModel", {}).get("companyName", "") if isinstance(model, dict) else ""),
|
|
|
}
|
|
|
|
|
|
- # --- 核心逻辑订正:精准识别变体区域并拆分多行数据 ---
|
|
|
variant_results = []
|
|
|
try:
|
|
|
- # 按照用户提供的线索,锁定核心容器
|
|
|
+ # --- 关键订正:基于用户发现的 expand-view-list 锁定变体区域 ---
|
|
|
# 兼容 expand-view-list 和 expand-view-list-wrapper
|
|
|
wrappers = self.driver.find_elements(By.CSS_SELECTOR, ".expand-view-list, .expand-view-list-wrapper")
|
|
|
if wrappers:
|
|
|
- # 1. 尝试直接获取所有 label 和 price 的对
|
|
|
- labels = wrappers[0].find_elements(By.CLASS_NAME, "item-label")
|
|
|
- prices = wrappers[0].find_elements(By.CLASS_NAME, "item-price-stock")
|
|
|
-
|
|
|
- if labels and prices and len(labels) == len(prices):
|
|
|
- for i in range(len(labels)):
|
|
|
- l_text = labels[i].text.strip()
|
|
|
- p_text = prices[i].text.strip()
|
|
|
+ # 寻找每一个变体子项条目
|
|
|
+ items = wrappers[0].find_elements(By.CSS_SELECTOR, ".expand-view-list-item, [class*='list-item'], .sku-item")
|
|
|
+ for item_el in items:
|
|
|
+ try:
|
|
|
+                            # 1. 描述文字 (item-label) -> 对应 Excel “规格尺码”和“颜色”列
|
|
|
+ label_el = item_el.find_element(By.CLASS_NAME, "item-label")
|
|
|
+ label_text = label_el.text.strip()
|
|
|
+
|
|
|
+ # 2. 逐条对应的价格 (item-price-stock) -> 对应 Excel “单品进价(元)”列
|
|
|
+ price_el = item_el.find_element(By.CLASS_NAME, "item-price-stock")
|
|
|
+ price_raw = price_el.text.strip()
|
|
|
# 价格清洗:只保留数字和小数点
|
|
|
- p_clean = re.sub(r'[^\d.]', '', p_text)
|
|
|
- if l_text:
|
|
|
+ price_clean = re.sub(r'[^\d.]', '', price_raw)
|
|
|
+
|
|
|
+ if label_text:
|
|
|
row = base_data.copy()
|
|
|
- row["color"] = l_text # 款式描述 -> 颜色列
|
|
|
- row["price"] = p_clean if p_clean else p_text # 逐条价格 -> 单品进价列
|
|
|
- if not row["spec"]: row["spec"] = l_text # 规格也同步填充
|
|
|
+ # 按照用户要求进行映射
|
|
|
+ row["spec"] = label_text # 描述文字填入“规格尺码”
|
|
|
+ row["color"] = label_text # 同步填入“颜色”
|
|
|
+ row["price"] = price_clean # 价格数字填入“单品进价(元)”
|
|
|
variant_results.append(row)
|
|
|
-
|
|
|
- # 2. 如果数量对不上,尝试按照子容器逐项提取
|
|
|
- if not variant_results:
|
|
|
- items = wrappers[0].find_elements(By.CSS_SELECTOR, ".expand-view-list-item, [class*='list-item'], .sku-item")
|
|
|
- for item_el in items:
|
|
|
- try:
|
|
|
- l_el = item_el.find_element(By.CLASS_NAME, "item-label")
|
|
|
- p_el = item_el.find_element(By.CLASS_NAME, "item-price-stock")
|
|
|
- if l_el and p_el:
|
|
|
- l_text = l_el.text.strip()
|
|
|
- p_text = p_el.text.strip()
|
|
|
- p_clean = re.sub(r'[^\d.]', '', p_text)
|
|
|
- if l_text:
|
|
|
- row = base_data.copy()
|
|
|
- row["color"] = l_text
|
|
|
- row["price"] = p_clean if p_clean else p_text
|
|
|
- if not row["spec"]: row["spec"] = l_text
|
|
|
- variant_results.append(row)
|
|
|
- except: continue
|
|
|
- except Exception as e:
|
|
|
- print(f" [!] 变体区域解析异常: {e}")
|
|
|
+ except: continue
|
|
|
+ except: pass
|
|
|
|
|
|
if variant_results:
|
|
|
- print(f" [+] 成功解析到 {len(variant_results)} 个款式变体")
|
|
|
return variant_results
|
|
|
+
|
|
|
+ # 方案 B: 回退到模型提取
|
|
|
+ sku_props = model.get("skuModel", {}).get("skuProps", []) or model.get("detailData", {}).get("skuProps", []) or []
|
|
|
+ main_prop = next((p for p in sku_props if any(k in p.get("prop", "") for k in ["颜色", "分类", "款式", "花色", "净含量"])), None)
|
|
|
+ if not main_prop and sku_props: main_prop = sku_props[0]
|
|
|
+ if main_prop and main_prop.get("value"):
|
|
|
+ results = []
|
|
|
+ for val in main_prop["value"]:
|
|
|
+ if val.get("name"):
|
|
|
+ row = base_data.copy()
|
|
|
+ row["color"] = val.get("name")
|
|
|
+ row["spec"] = val.get("name")
|
|
|
+ row["price"] = trade.get("minPrice", "")
|
|
|
+ results.append(row)
|
|
|
+ return results
|
|
|
+
|
|
|
+ base_data["price"] = trade.get("minPrice", "")
|
|
|
return [base_data]
|
|
|
except: return None
|
|
|
|
|
|
def _extract_all_methods(self):
|
|
|
+ """ 强化版探测:从内存变量中抓取列表 """
|
|
|
results = []
|
|
|
seen_ids = set()
|
|
|
def add_item(name, link):
|