|
|
@@ -34,10 +34,7 @@ class Scraper1688:
|
|
|
|
|
|
def _find_chrome(self):
|
|
|
import winreg
|
|
|
- reg_paths = [
|
|
|
- (winreg.HKEY_LOCAL_MACHINE, r"SOFTWARE\Microsoft\Windows\CurrentVersion\App Paths\chrome.exe"),
|
|
|
- (winreg.HKEY_CURRENT_USER, r"SOFTWARE\Microsoft\Windows\CurrentVersion\App Paths\chrome.exe")
|
|
|
- ]
|
|
|
+ reg_paths = [(winreg.HKEY_LOCAL_MACHINE, r"SOFTWARE\Microsoft\Windows\CurrentVersion\App Paths\chrome.exe"), (winreg.HKEY_CURRENT_USER, r"SOFTWARE\Microsoft\Windows\CurrentVersion\App Paths\chrome.exe")]
|
|
|
for hkey, subkey in reg_paths:
|
|
|
try:
|
|
|
with winreg.OpenKey(hkey, subkey) as key:
|
|
|
@@ -73,7 +70,7 @@ class Scraper1688:
|
|
|
return opts
|
|
|
try:
|
|
|
self.driver = uc.Chrome(options=create_options(), headless=headless, browser_executable_path=chrome_path, use_subprocess=True)
|
|
|
- except:
|
|
|
+ except Exception:
|
|
|
self.driver = uc.Chrome(options=create_options(), headless=headless, use_subprocess=True)
|
|
|
|
|
|
def clean_url(self, url):
|
|
|
@@ -112,37 +109,29 @@ class Scraper1688:
|
|
|
print(f"[*] 正在处理列表页: 第 {page} 页...")
|
|
|
self.driver.get(f"{base_url}&beginPage={page}&page={page}")
|
|
|
self.check_for_captcha()
|
|
|
- for i in range(1, 9):
|
|
|
- self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/8});")
|
|
|
- time.sleep(random.uniform(1.2, 2.5))
|
|
|
- if i == 4:
|
|
|
- self.driver.execute_script("window.scrollBy(0, -400);")
|
|
|
- time.sleep(1.0)
|
|
|
- time.sleep(5)
|
|
|
+ for i in range(1, 6):
|
|
|
+ self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/5});")
|
|
|
+ time.sleep(1.5)
|
|
|
|
|
|
page_results = self._extract_all_methods()
|
|
|
- print(f" [+] 本页发现 {len(page_results)} 个商品原始条目")
|
|
|
+ print(f" [+] 本页发现 {len(page_results)} 个原始条目")
|
|
|
|
|
|
page_batch = []
|
|
|
for it in page_results:
|
|
|
- clean_url = self.clean_url(it.get("link"))
|
|
|
+ clean_url = self.clean_url(it["link"])
|
|
|
if clean_url and clean_url not in all_links:
|
|
|
all_links.add(clean_url)
|
|
|
- print(f" [>] 正在启动详情抓取: {clean_url}")
|
|
|
+ print(f" [>] 正在执行详情抓取流程: {clean_url}")
|
|
|
detail_results = self.scrape_detail(clean_url)
|
|
|
if detail_results:
|
|
|
page_batch.extend(detail_results)
|
|
|
else:
|
|
|
- page_batch.append({
|
|
|
- "category": "", "brand": "", "name": it.get("name", "未知"),
|
|
|
- "color": "", "spec": "", "material": "", "price": "",
|
|
|
- "moq": "", "wholesale_price": "", "link": clean_url, "supplier": ""
|
|
|
- })
|
|
|
+ page_batch.append({"category": "", "brand": "", "name": it.get("name", "未知"), "color": "", "spec": "", "material": "", "price": "", "moq": "", "wholesale_price": "", "link": clean_url, "supplier": ""})
|
|
|
|
|
|
if len(page_batch) >= 10:
|
|
|
yield page_batch
|
|
|
page_batch = []
|
|
|
- time.sleep(random.uniform(15, 25))
|
|
|
+ time.sleep(random.uniform(5, 10))
|
|
|
if len(all_links) >= total_count + initial_count: break
|
|
|
|
|
|
if page_batch: yield page_batch
|
|
|
@@ -153,10 +142,9 @@ class Scraper1688:
|
|
|
return list(all_links)
|
|
|
|
|
|
def scrape_detail(self, url):
|
|
|
- """ 深度解析详情页,支持款式和逐条价格获取 """
|
|
|
try:
|
|
|
self.driver.get(url)
|
|
|
- time.sleep(random.uniform(5, 10))
|
|
|
+ time.sleep(random.uniform(5, 8))
|
|
|
self.check_for_captcha()
|
|
|
model = self.driver.execute_script(
|
|
|
"return (window.context && window.context.result && window.context.result.global && window.context.result.global.globalData && window.context.result.global.globalData.model) || window.__INITIAL_DATA__ || window.iDetailData || window.iDetailConfig || null;"
|
|
|
@@ -197,28 +185,21 @@ class Scraper1688:
|
|
|
try:
|
|
|
label = item_el.find_element(By.CLASS_NAME, "item-label").text.strip()
|
|
|
price = item_el.find_element(By.CLASS_NAME, "item-price-stock").text.strip()
|
|
|
- if label:
|
|
|
- variant_data_list.append({"label": label, "price": re.sub(r'[^\d.]', '', price)})
|
|
|
+ if label: variant_data_list.append({"label": label, "price": re.sub(r'[^\d.]', '', price)})
|
|
|
except: continue
|
|
|
except: pass
|
|
|
|
|
|
if variant_data_list:
|
|
|
results = []
|
|
|
for vd in variant_data_list:
|
|
|
- row = base_data.copy()
|
|
|
- row["color"] = vd["label"]
|
|
|
- row["price"] = vd["price"]
|
|
|
- results.append(row)
|
|
|
+ row = base_data.copy(); row["color"] = vd["label"]; row["price"] = vd["price"]; results.append(row)
|
|
|
return results
|
|
|
return [base_data]
|
|
|
except: return None
|
|
|
|
|
|
def _extract_all_methods(self):
|
|
|
results = []
|
|
|
- scripts = [
|
|
|
- "return JSON.stringify(window.data || window.context?.result?.data || window.__INITIAL_DATA__)",
|
|
|
- "return JSON.stringify(window.context?.result?.global?.globalData?.data || null)"
|
|
|
- ]
|
|
|
+ scripts = ["return JSON.stringify(window.data || window.context?.result?.data || window.__INITIAL_DATA__)", "return JSON.stringify(window.context?.result?.global?.globalData?.data || null)"]
|
|
|
for s in scripts:
|
|
|
try:
|
|
|
res = self.driver.execute_script(s)
|
|
|
@@ -236,14 +217,19 @@ class Scraper1688:
|
|
|
if link: results.append({"name": str(o.get('title', o.get('subject', ''))), "link": link})
|
|
|
if results: return results
|
|
|
except: continue
|
|
|
- for s in [".sm-offer-item", ".offer-card-item", "[class*='offer-card']", ".offer-item"]:
|
|
|
- for el in self.driver.find_elements(By.CSS_SELECTOR, s):
|
|
|
- try:
|
|
|
- a = el.find_element(By.TAG_NAME, "a")
|
|
|
- link = a.get_attribute("href")
|
|
|
- if link and "1688.com" in link: results.append({"name": el.text.split('\n')[0][:50], "link": link})
|
|
|
- except: continue
|
|
|
- if results: break
|
|
|
+
|
|
|
+ links = self.driver.find_elements(By.TAG_NAME, "a")
|
|
|
+ seen_ids = set()
|
|
|
+ for l in links:
|
|
|
+ try:
|
|
|
+ href = l.get_attribute("href")
|
|
|
+ if href:
|
|
|
+ id_match = re.search(r'offer/(\d{9,15})\.html', href)
|
|
|
+ if id_match:
|
|
|
+ oid = id_match.group(1)
|
|
|
+ if oid not in seen_ids:
|
|
|
+ seen_ids.add(oid); results.append({"name": l.text.split('\n')[0][:50] or f"商品-{oid}", "link": href})
|
|
|
+                except Exception: continue
|
|
|
return results
|
|
|
|
|
|
def quit(self):
|