|
@@ -44,6 +44,29 @@ class Scraper1688:
|
|
|
except: continue
|
|
except: continue
|
|
|
return None
|
|
return None
|
|
|
|
|
|
|
|
|
|
+ def _get_chrome_version_main(self):
|
|
|
|
|
+ """从 Windows 注册表读取本机 Chrome 主版本号(如 145),用于匹配 ChromeDriver,避免版本不一致。"""
|
|
|
|
|
+ if os.name != "nt":
|
|
|
|
|
+ return None
|
|
|
|
|
+ import winreg
|
|
|
|
|
+ reg_paths = [
|
|
|
|
|
+ (winreg.HKEY_LOCAL_MACHINE, r"SOFTWARE\Wow6432Node\Microsoft\Windows\CurrentVersion\Uninstall\Google Chrome"),
|
|
|
|
|
+ (winreg.HKEY_LOCAL_MACHINE, r"SOFTWARE\Microsoft\Windows\CurrentVersion\Uninstall\Google Chrome"),
|
|
|
|
|
+ (winreg.HKEY_CURRENT_USER, r"SOFTWARE\Microsoft\Windows\CurrentVersion\Uninstall\Google Chrome"),
|
|
|
|
|
+ ]
|
|
|
|
|
+ for hkey, subkey in reg_paths:
|
|
|
|
|
+ for value_name in ("DisplayVersion", "version"):
|
|
|
|
|
+ try:
|
|
|
|
|
+ with winreg.OpenKey(hkey, subkey) as key:
|
|
|
|
|
+ ver, _ = winreg.QueryValueEx(key, value_name)
|
|
|
|
|
+ if ver:
|
|
|
|
|
+ main = int(str(ver).split(".")[0])
|
|
|
|
|
+ if 80 <= main <= 200:
|
|
|
|
|
+ return main
|
|
|
|
|
+ except Exception:
|
|
|
|
|
+ continue
|
|
|
|
|
+ return None
|
|
|
|
|
+
|
|
|
def _cleanup(self):
|
|
def _cleanup(self):
|
|
|
if os.name == 'nt':
|
|
if os.name == 'nt':
|
|
|
for proc in ['chrome.exe', 'chromedriver.exe']:
|
|
for proc in ['chrome.exe', 'chromedriver.exe']:
|
|
@@ -58,6 +81,7 @@ class Scraper1688:
|
|
|
|
|
|
|
|
def _init_chrome(self, headless):
|
|
def _init_chrome(self, headless):
|
|
|
chrome_path = self._find_chrome()
|
|
chrome_path = self._find_chrome()
|
|
|
|
|
+ version_main = self._get_chrome_version_main()
|
|
|
def create_options():
|
|
def create_options():
|
|
|
opts = uc.ChromeOptions()
|
|
opts = uc.ChromeOptions()
|
|
|
opts.add_argument(f'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36')
|
|
opts.add_argument(f'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36')
|
|
@@ -69,12 +93,16 @@ class Scraper1688:
|
|
|
opts.add_argument("--disable-dev-shm-usage")
|
|
opts.add_argument("--disable-dev-shm-usage")
|
|
|
opts.add_argument("--remote-allow-origins=*")
|
|
opts.add_argument("--remote-allow-origins=*")
|
|
|
return opts
|
|
return opts
|
|
|
|
|
+ kwargs = {"options": create_options(), "headless": headless, "use_subprocess": True}
|
|
|
|
|
+ if version_main is not None:
|
|
|
|
|
+ kwargs["version_main"] = version_main
|
|
|
|
|
+ if chrome_path:
|
|
|
|
|
+ kwargs["browser_executable_path"] = chrome_path
|
|
|
try:
|
|
try:
|
|
|
- self.driver = uc.Chrome(options=create_options(), headless=headless, browser_executable_path=chrome_path, use_subprocess=True)
|
|
|
|
|
- # self.driver = uc.Chrome(options=create_options(), headless=headless, browser_executable_path=chrome_path, use_subprocess=True)
|
|
|
|
|
- except :
|
|
|
|
|
- # except Exception:
|
|
|
|
|
- self.driver = uc.Chrome(options=create_options(), headless=headless, use_subprocess=True)
|
|
|
|
|
|
|
+ self.driver = uc.Chrome(**kwargs)
|
|
|
|
|
+ except Exception:
|
|
|
|
|
+ kwargs.pop("version_main", None)
|
|
|
|
|
+ self.driver = uc.Chrome(**kwargs)
|
|
|
|
|
|
|
|
def clean_url(self, url):
|
|
def clean_url(self, url):
|
|
|
if not url: return ""
|
|
if not url: return ""
|
|
@@ -111,14 +139,14 @@ class Scraper1688:
|
|
|
self.driver.get(f"{base_url}&beginPage={page}&page={page}")
|
|
self.driver.get(f"{base_url}&beginPage={page}&page={page}")
|
|
|
self.check_for_captcha()
|
|
self.check_for_captcha()
|
|
|
|
|
|
|
|
- # --- 关键:脉冲式分段滚动,强制触发懒加载 ---
|
|
|
|
|
- for i in range(1, 16):
|
|
|
|
|
- self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/15});")
|
|
|
|
|
- time.sleep(random.uniform(1.2, 2.5))
|
|
|
|
|
- if i % 4 == 0:
|
|
|
|
|
- self.driver.execute_script(f"window.scrollBy(0, -400);")
|
|
|
|
|
- time.sleep(1.0)
|
|
|
|
|
- time.sleep(random.uniform(3, 6))
|
|
|
|
|
|
|
+ # --- 脉冲式分段滚动触发懒加载(已压缩等待,兼顾速度与加载)---
|
|
|
|
|
+ for i in range(1, 9):
|
|
|
|
|
+ self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/8});")
|
|
|
|
|
+ time.sleep(random.uniform(0.35, 0.85))
|
|
|
|
|
+ if i % 3 == 0:
|
|
|
|
|
+ self.driver.execute_script("window.scrollBy(0, -300);")
|
|
|
|
|
+ time.sleep(0.4)
|
|
|
|
|
+ time.sleep(random.uniform(1.2, 2.8))
|
|
|
|
|
|
|
|
page_results = self._extract_all_methods()
|
|
page_results = self._extract_all_methods()
|
|
|
print(f" [+] 本页解析完成:共发现 {len(page_results)} 个商品链接")
|
|
print(f" [+] 本页解析完成:共发现 {len(page_results)} 个商品链接")
|
|
@@ -146,13 +174,13 @@ class Scraper1688:
|
|
|
yield page_batch
|
|
yield page_batch
|
|
|
page_batch = []
|
|
page_batch = []
|
|
|
|
|
|
|
|
- time.sleep(random.uniform(40, 80))
|
|
|
|
|
|
|
+ time.sleep(random.uniform(15, 35))
|
|
|
if len(all_links) >= total_count + initial_count: break
|
|
if len(all_links) >= total_count + initial_count: break
|
|
|
|
|
|
|
|
if page_batch: yield page_batch
|
|
if page_batch: yield page_batch
|
|
|
page += 1
|
|
page += 1
|
|
|
self.driver.get("https://www.1688.com")
|
|
self.driver.get("https://www.1688.com")
|
|
|
- time.sleep(60)
|
|
|
|
|
|
|
+ time.sleep(random.uniform(18, 38))
|
|
|
return list(all_links)
|
|
return list(all_links)
|
|
|
|
|
|
|
|
def scrape_detail(self, url):
|
|
def scrape_detail(self, url):
|
|
@@ -171,7 +199,7 @@ class Scraper1688:
|
|
|
for btn in expand_btns:
|
|
for btn in expand_btns:
|
|
|
if btn.is_displayed():
|
|
if btn.is_displayed():
|
|
|
self.driver.execute_script("arguments[0].click();", btn)
|
|
self.driver.execute_script("arguments[0].click();", btn)
|
|
|
- time.sleep(1.5)
|
|
|
|
|
|
|
+ time.sleep(0.6)
|
|
|
except: pass
|
|
except: pass
|
|
|
|
|
|
|
|
self.check_for_captcha()
|
|
self.check_for_captcha()
|