|
|
@@ -1,4 +1,3 @@
|
|
|
-# 【版本:20260115-终极订正版】
|
|
|
# 针对 Python 3.12+ 移除 distutils 的兼容性补丁
|
|
|
import sys
|
|
|
try:
|
|
|
@@ -34,7 +33,6 @@ class Scraper1688:
|
|
|
stealth(self.driver, languages=["zh-CN", "zh"], vendor="Google Inc.", platform="Win32", fix_hairline=True)
|
|
|
|
|
|
def _find_chrome(self):
|
|
|
- """ 强力锁定 Chrome 安装路径 """
|
|
|
import winreg
|
|
|
reg_paths = [
|
|
|
(winreg.HKEY_LOCAL_MACHINE, r"SOFTWARE\Microsoft\Windows\CurrentVersion\App Paths\chrome.exe"),
|
|
|
@@ -75,21 +73,16 @@ class Scraper1688:
|
|
|
return opts
|
|
|
try:
|
|
|
self.driver = uc.Chrome(options=create_options(), headless=headless, browser_executable_path=chrome_path, use_subprocess=True)
|
|
|
- except Exception as e:
|
|
|
+            except Exception:
|
|
|
self.driver = uc.Chrome(options=create_options(), headless=headless, use_subprocess=True)
|
|
|
|
|
|
def clean_url(self, url):
|
|
|
- """ 【关键订正】极其简化的 ID 提取逻辑,只要是商品就必须进入详情页 """
|
|
|
if not url: return ""
|
|
|
- # 强制转换为字符串并处理
|
|
|
url_str = str(url)
|
|
|
if url_str.startswith("//"): url_str = "https:" + url_str
|
|
|
-
|
|
|
- # 只要能匹配到连续的 9-15 位数字(1688 商品 ID 特征),就重组
|
|
|
id_match = re.search(r'(\d{9,15})', url_str)
|
|
|
if id_match:
|
|
|
- standard_url = f"https://detail.1688.com/offer/{id_match.group(1)}.html"
|
|
|
- return standard_url
|
|
|
+ return f"https://detail.1688.com/offer/{id_match.group(1)}.html"
|
|
|
return ""
|
|
|
|
|
|
def check_for_captcha(self):
|
|
|
@@ -112,7 +105,6 @@ class Scraper1688:
|
|
|
base_url = f"https://s.1688.com/selloffer/offer_search.htm?keywords={gbk_keyword}&n=y&netType=1%2C11%2C16"
|
|
|
self.driver.get("https://www.1688.com")
|
|
|
self.check_for_captcha()
|
|
|
-
|
|
|
all_links = existing_links if existing_links is not None else set()
|
|
|
page, initial_count = 1, len(all_links)
|
|
|
|
|
|
@@ -120,47 +112,38 @@ class Scraper1688:
|
|
|
print(f"[*] 正在处理列表页: 第 {page} 页...")
|
|
|
self.driver.get(f"{base_url}&beginPage={page}&page={page}")
|
|
|
self.check_for_captcha()
|
|
|
- for i in range(1, 5):
|
|
|
- self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/4});")
|
|
|
- time.sleep(1.2)
|
|
|
+ for i in range(1, 9):
|
|
|
+ self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/8});")
|
|
|
+ time.sleep(random.uniform(1.2, 2.5))
|
|
|
+ if i == 4:
|
|
|
+ self.driver.execute_script("window.scrollBy(0, -400);")
|
|
|
+ time.sleep(1.0)
|
|
|
+ time.sleep(5)
|
|
|
|
|
|
- # 获取本页链接 (完全对标 req.py 变量探测)
|
|
|
page_results = self._extract_all_methods()
|
|
|
- print(f" [+] 本页发现 {len(page_results)} 个原始条目")
|
|
|
+ print(f" [+] 本页发现 {len(page_results)} 个商品原始条目")
|
|
|
|
|
|
page_batch = []
|
|
|
for it in page_results:
|
|
|
- raw_link = it.get("link")
|
|
|
- clean_url = self.clean_url(raw_link)
|
|
|
-
|
|
|
- if not clean_url:
|
|
|
- continue
|
|
|
-
|
|
|
- if clean_url in all_links:
|
|
|
- print(f" [-] 跳过已存在商品: {clean_url}")
|
|
|
- continue
|
|
|
-
|
|
|
- all_links.add(clean_url)
|
|
|
- # 【强制日志】只要进入这里,就一定会打印并执行详情抓取
|
|
|
- print(f" [>] 正在执行详情抓取流程: {clean_url}")
|
|
|
-
|
|
|
- detail_results = self.scrape_detail(clean_url)
|
|
|
- if detail_results:
|
|
|
- page_batch.extend(detail_results)
|
|
|
- else:
|
|
|
- # 即使详情失败也记录基本信息,防止死循环
|
|
|
- page_batch.append({
|
|
|
- "category": "", "brand": "", "name": it.get("name", "未知"),
|
|
|
- "color": "", "spec": "", "material": "", "price": "",
|
|
|
- "moq": "", "wholesale_price": "", "link": clean_url, "supplier": ""
|
|
|
- })
|
|
|
-
|
|
|
- if len(page_batch) >= 10:
|
|
|
- yield page_batch
|
|
|
- page_batch = []
|
|
|
-
|
|
|
- time.sleep(random.uniform(15, 25))
|
|
|
- if len(all_links) >= total_count + initial_count: break
|
|
|
+ clean_url = self.clean_url(it.get("link"))
|
|
|
+ if clean_url and clean_url not in all_links:
|
|
|
+ all_links.add(clean_url)
|
|
|
+ print(f" [>] 正在启动详情抓取: {clean_url}")
|
|
|
+ detail_results = self.scrape_detail(clean_url)
|
|
|
+ if detail_results:
|
|
|
+ page_batch.extend(detail_results)
|
|
|
+ else:
|
|
|
+ page_batch.append({
|
|
|
+ "category": "", "brand": "", "name": it.get("name", "未知"),
|
|
|
+ "color": "", "spec": "", "material": "", "price": "",
|
|
|
+ "moq": "", "wholesale_price": "", "link": clean_url, "supplier": ""
|
|
|
+ })
|
|
|
+
|
|
|
+ if len(page_batch) >= 10:
|
|
|
+ yield page_batch
|
|
|
+ page_batch = []
|
|
|
+ time.sleep(random.uniform(15, 25))
|
|
|
+ if len(all_links) >= total_count + initial_count: break
|
|
|
|
|
|
if page_batch: yield page_batch
|
|
|
page += 1
|
|
|
@@ -170,17 +153,13 @@ class Scraper1688:
|
|
|
return list(all_links)
|
|
|
|
|
|
def scrape_detail(self, url):
|
|
|
- """ 精准解析:完全同步自 req.py 的模型获取逻辑 """
|
|
|
+ """ 深度解析详情页,支持款式和逐条价格获取 """
|
|
|
try:
|
|
|
self.driver.get(url)
|
|
|
- time.sleep(random.uniform(5, 8))
|
|
|
+ time.sleep(random.uniform(5, 10))
|
|
|
self.check_for_captcha()
|
|
|
- # 执行 JS 获取核心模型 (完全对标 req.py)
|
|
|
model = self.driver.execute_script(
|
|
|
- "return (window.context && window.context.result && "
|
|
|
- "window.context.result.global && window.context.result.global.globalData "
|
|
|
- "&& window.context.result.global.globalData.model) || "
|
|
|
- "window.__INITIAL_DATA__ || window.iDetailData || window.iDetailConfig || null;"
|
|
|
+ "return (window.context && window.context.result && window.context.result.global && window.context.result.global.globalData && window.context.result.global.globalData.model) || window.__INITIAL_DATA__ || window.iDetailData || window.iDetailConfig || null;"
|
|
|
)
|
|
|
if not model: return None
|
|
|
|
|
|
@@ -196,7 +175,8 @@ class Scraper1688:
|
|
|
return ""
|
|
|
|
|
|
trade = model.get("tradeModel", {}) if isinstance(model, dict) else {}
|
|
|
- range_text = " / ".join([f"{r.get('beginAmount')}起 ¥{r.get('price') or r.get('discountPrice')}" for r in (trade.get("disPriceRanges") or trade.get("currentPrices") or [])])
|
|
|
+ ranges = trade.get("disPriceRanges") or trade.get("currentPrices") or []
|
|
|
+ range_text = " / ".join([f"{r.get('beginAmount')}起 ¥{r.get('price') or r.get('discountPrice')}" for r in ranges])
|
|
|
|
|
|
base_data = {
|
|
|
"category": (model.get("offerDetail", {}).get("leafCategoryName", "") if isinstance(model, dict) else "") or self.driver.find_element(By.CSS_SELECTOR, "div[class*=breadcrumb] a:last-child").text.strip(),
|
|
|
@@ -210,7 +190,6 @@ class Scraper1688:
|
|
|
|
|
|
variant_data_list = []
|
|
|
try:
|
|
|
- # 方案 A: 优先使用 expand-view-list-wrapper 获取款式和价格
|
|
|
wrappers = self.driver.find_elements(By.CLASS_NAME, "expand-view-list-wrapper")
|
|
|
if wrappers:
|
|
|
items = wrappers[0].find_elements(By.CSS_SELECTOR, ".expand-view-list-item, [class*='list-item'], .sku-item")
|
|
|
@@ -218,22 +197,24 @@ class Scraper1688:
|
|
|
try:
|
|
|
label = item_el.find_element(By.CLASS_NAME, "item-label").text.strip()
|
|
|
price = item_el.find_element(By.CLASS_NAME, "item-price-stock").text.strip()
|
|
|
- if label: variant_data_list.append({"label": label, "price": re.sub(r'[^\d.]', '', price)})
|
|
|
+ if label:
|
|
|
+ variant_data_list.append({"label": label, "price": re.sub(r'[^\d.]', '', price)})
|
|
|
except: continue
|
|
|
except: pass
|
|
|
|
|
|
if variant_data_list:
|
|
|
results = []
|
|
|
for vd in variant_data_list:
|
|
|
- row = base_data.copy(); row["color"] = vd["label"]; row["price"] = vd["price"]; results.append(row)
|
|
|
+ row = base_data.copy()
|
|
|
+ row["color"] = vd["label"]
|
|
|
+ row["price"] = vd["price"]
|
|
|
+ results.append(row)
|
|
|
return results
|
|
|
return [base_data]
|
|
|
except: return None
|
|
|
|
|
|
def _extract_all_methods(self):
|
|
|
- """ 强化版:全力探测 1688 列表页数据 (对标 req.py) """
|
|
|
results = []
|
|
|
- # 1. 深度内存变量扫描
|
|
|
scripts = [
|
|
|
"return JSON.stringify(window.data || window.context?.result?.data || window.__INITIAL_DATA__)",
|
|
|
"return JSON.stringify(window.context?.result?.global?.globalData?.data || null)"
|
|
|
@@ -255,8 +236,7 @@ class Scraper1688:
|
|
|
if link: results.append({"name": str(o.get('title', o.get('subject', ''))), "link": link})
|
|
|
if results: return results
|
|
|
except: continue
|
|
|
- # 2. 暴力 DOM 选择器保底
|
|
|
- for s in [".sm-offer-item", ".offer-card-item", ".pc-search-offer-item", "[class*='offer-card']", ".offer-item"]:
|
|
|
+ for s in [".sm-offer-item", ".offer-card-item", "[class*='offer-card']", ".offer-item"]:
|
|
|
for el in self.driver.find_elements(By.CSS_SELECTOR, s):
|
|
|
try:
|
|
|
a = el.find_element(By.TAG_NAME, "a")
|