# scraper.py — 1688.com product scraper (Selenium / Edge remote-debugging).
# Compatibility shim for Python 3.12+, where the stdlib `distutils` package
# was removed (PEP 632): fabricate minimal `distutils` / `distutils.version`
# modules in sys.modules so legacy third-party imports keep working.
import sys
try:
    import distutils
except ImportError:
    from types import ModuleType
    # Register placeholder modules so `import distutils.version` succeeds.
    d, v = ModuleType("distutils"), ModuleType("distutils.version")
    d.version = v
    sys.modules.update({"distutils": d, "distutils.version": v})
    class LooseVersion:
        # Minimal stand-in: keeps the raw version value and stringifies it.
        def __init__(self, v): self.v = v
        # Stub comparison: every instance reports "less than" anything.
        def __lt__(self, o): return True
        def __str__(self): return str(self.v)
    v.LooseVersion = LooseVersion
  15. import time, random, re, os, subprocess, urllib.parse, json, traceback, socket
  16. from selenium import webdriver
  17. from selenium.webdriver.edge.options import Options as EdgeOptions
  18. from selenium.webdriver.edge.service import Service as EdgeService
  19. from selenium.webdriver.chrome.options import Options as ChromeOptions
  20. import undetected_chromedriver as uc
  21. from selenium.webdriver.common.by import By
  22. from selenium.webdriver.common.action_chains import ActionChains
  23. from selenium_stealth import stealth
  24. class Scraper1688:
  25. def __init__(self, headless=True, status_callback=None, log_callback=None):
  26. self.headless = headless
  27. self.status_callback = status_callback
  28. self.log_callback = log_callback # 新增:用于向 GUI 发送普通日志
  29. self.user_data_path = os.path.abspath(os.path.join(os.getcwd(), "1688_user_data"))
  30. self.driver = None
  31. edge_path = self._find_edge()
  32. if edge_path:
  33. print(f"[*] 【极致稳定模式】正在启动 Edge 深度伪装环境...")
  34. self._cleanup_processes()
  35. # 使用固定且持久的 Session 目录,确保长效免登录
  36. edge_user_data = os.path.join(os.getcwd(), "1688_edge_ultimate_session")
  37. cmd = [
  38. edge_path,
  39. "--remote-debugging-port=9222",
  40. f"--user-data-dir={edge_user_data}",
  41. "--no-first-run",
  42. "--no-default-browser-check",
  43. "--disable-blink-features=AutomationControlled"
  44. ]
  45. if headless: cmd.append("--headless")
  46. try:
  47. subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
  48. time.sleep(6)
  49. opts = EdgeOptions()
  50. opts.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
  51. self.driver = webdriver.Edge(options=opts)
  52. print("[+] Edge 极致稳定环境接管成功!")
  53. except Exception as e:
  54. print(f"[!] Edge 启动失败: {e}")
  55. if not self.driver:
  56. self._init_chrome(headless)
  57. if self.driver:
  58. # 深度擦除自动化指纹
  59. try:
  60. self.driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
  61. "source": """
  62. Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
  63. Object.defineProperty(navigator, 'languages', { get: () => ['zh-CN', 'zh'] });
  64. Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
  65. """
  66. })
  67. except: pass
  68. def _find_edge(self):
  69. import winreg
  70. reg_paths = [(winreg.HKEY_LOCAL_MACHINE, r"SOFTWARE\Microsoft\Windows\CurrentVersion\App Paths\msedge.exe"), (winreg.HKEY_CURRENT_USER, r"SOFTWARE\Microsoft\Windows\CurrentVersion\App Paths\msedge.exe")]
  71. for hkey, subkey in reg_paths:
  72. try:
  73. with winreg.OpenKey(hkey, subkey) as key:
  74. path, _ = winreg.QueryValueEx(key, "")
  75. if os.path.exists(path): return path
  76. except: continue
  77. return None
  78. def _cleanup_processes(self):
  79. if os.name == 'nt':
  80. for proc in ['msedge.exe', 'msedgedriver.exe', 'chromedriver.exe']:
  81. subprocess.call(['taskkill', '/F', '/IM', proc, '/T'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
  82. def _init_chrome(self, headless):
  83. def create_options():
  84. opts = uc.ChromeOptions()
  85. opts.add_argument(f"--user-data-dir={self.user_data_path}")
  86. return opts
  87. self.driver = uc.Chrome(options=create_options(), headless=headless)
  88. def check_for_captcha(self):
  89. def is_blocked():
  90. try:
  91. url, src = self.driver.current_url.lower(), self.driver.page_source.lower()
  92. sliders = self.driver.find_elements(By.ID, "nc_1_n1z")
  93. return (len(sliders) > 0 and sliders[0].is_displayed()) or "login.1688.com" in url or "punish" in url or "哎哟喂" in src
  94. except: return False
  95. if is_blocked():
  96. msg = "请登录验证"
  97. if self.status_callback: self.status_callback(True, msg)
  98. while is_blocked(): time.sleep(3)
  99. if self.status_callback: self.status_callback(False, "验证通过")
  100. cool_msg = "[*] 监测到干预完成,进入 120 秒深度冷却期以重置风控权重..."
  101. print(cool_msg)
  102. if self.log_callback: self.log_callback(f"<font color='orange'>{cool_msg}</font>")
  103. time.sleep(120)
  104. return True
  105. def _human_behavior(self, duration=10):
  106. """ 高级拟人化行为模拟 """
  107. start_time = time.time()
  108. while time.time() - start_time < duration:
  109. try:
  110. # 1. 随机滚动
  111. scroll_y = random.randint(200, 600)
  112. self.driver.execute_script(f"window.scrollBy(0, {scroll_y});")
  113. # 2. 随机鼠标晃动
  114. actions = ActionChains(self.driver)
  115. actions.move_by_offset(random.randint(-5, 5), random.randint(-5, 5)).perform()
  116. time.sleep(random.uniform(1.5, 4.0))
  117. # 3. 概率性往回滚
  118. if random.random() > 0.7:
  119. self.driver.execute_script(f"window.scrollBy(0, -{random.randint(100, 300)});")
  120. except: break
    def search_products_yield(self, keyword, total_count=200, existing_links=None):
        """Search 1688 for *keyword* and lazily yield batches of product dicts.

        Generator.  Walks the paginated search results, deduplicates offer
        links, scrapes each detail page, and yields lists of up to 10 product
        dicts at a time.  Random scrolling, long sleeps and periodic "deep
        cool-down" pauses are interleaved throughout to mimic a human and
        reduce risk-control triggers.

        Args:
            keyword: search term; quoted as GBK for the search URL.
            total_count: stop once this many NEW links have been collected.
            existing_links: optional set of already-seen canonical links;
                mutated in place when provided.
        """
        gbk_keyword = urllib.parse.quote(keyword, encoding='gbk')
        base_url = f"https://s.1688.com/selloffer/offer_search.htm?keywords={gbk_keyword}&n=y&netType=1%2C11%2C16"
        # Warm up on the homepage before hitting the search endpoint.
        self.driver.get("https://www.1688.com")
        time.sleep(random.randint(3, 6))
        self.check_for_captcha()
        all_links = existing_links if existing_links is not None else set()
        page, initial_count = 1, len(all_links)
        # Randomly pick the threshold for the next deep cool-down (5-12 new items).
        next_cool_threshold = random.randint(5, 12)
        while len(all_links) < total_count + initial_count:
            print(f"[*] 正在模拟搜索: 第 {page} 页...")
            self.driver.get(f"{base_url}&beginPage={page}&page={page}")
            self.check_for_captcha()
            # Simulate "leafing through" the results page with random scrolls.
            for _ in range(random.randint(5, 8)):
                self.driver.execute_script(f"window.scrollBy(0, {random.randint(400, 800)});")
                time.sleep(random.uniform(1.5, 3.5))
                if random.random() > 0.8:
                    self.driver.execute_script("window.scrollBy(0, -300);")
            page_results = self._extract_all_methods()
            page_batch = []
            for it in page_results:
                clean_url = self.clean_url(it["link"])
                if clean_url and clean_url not in all_links:
                    all_links.add(clean_url)
                    # --- Core fix: randomly triggered deep cool-down ---
                    new_processed = len(all_links) - initial_count
                    if new_processed >= next_cool_threshold:
                        rest = random.randint(120, 300)
                        cool_msg = f"[*] 随机触发深度保护 (已处理{new_processed}条),睡眠 {rest} 秒模拟休息..."
                        print(cool_msg)
                        if self.log_callback: self.log_callback(f"<font color='orange'><b>{cool_msg}</b></font>")
                        time.sleep(rest)
                        next_cool_threshold += random.randint(5, 12)  # schedule the next random checkpoint
                    print(f" [>] 详情仿真采集: {clean_url}")
                    # Long random pause before visiting the detail page.
                    time.sleep(random.uniform(5, 12))
                    detail_results = self.scrape_detail(clean_url)
                    if detail_results: page_batch.extend(detail_results)
                    else: page_batch.append({"link": clean_url, "name": it["name"]})
                    if len(page_batch) >= 10:
                        yield page_batch
                        page_batch = []
                    # Wide random gap between detail-page visits.
                    time.sleep(random.uniform(30, 60))
                    if len(all_links) >= total_count + initial_count: break
            if page_batch: yield page_batch
            page += 1
            # Every 3 pages, revisit the homepage to break up the access pattern.
            if page % 3 == 0:
                self.driver.get("https://www.1688.com")
                time.sleep(random.randint(10, 20))
        # NOTE: a generator's return value rides on StopIteration; normal
        # `for` iteration over this generator never sees it.
        return list(all_links)
    def scrape_detail(self, url):
        """Scrape one offer detail page into a list of product dicts.

        Reads the page's embedded JS data model (several known globals are
        tried) and builds one dict per main SKU variant (color/style), or a
        single dict when no variants are found.

        Returns:
            A list of product dicts, or None when the model is missing or
            any step fails (best-effort: the caller falls back to bare
            link+name).
        """
        try:
            self.driver.get(url)
            # --- Core improvement: dwell-time simulation on the detail page ---
            self._human_behavior(duration=random.randint(12, 25))
            self.check_for_captcha()
            # Try the known locations of the page data model, newest first.
            model = self.driver.execute_script(
                "return (window.context && window.context.result && window.context.result.global && window.context.result.global.globalData && window.context.result.global.globalData.model) || window.__INITIAL_DATA__ || window.iDetailData || window.iDetailConfig || null;"
            )
            if not model: return None
            def get_attr(name):
                # Substring-match an attribute by name across the two known
                # attribute layouts; returns "" when absent or on any error.
                try:
                    attrs = model.get("offerDetail", {}).get("featureAttributes", [])
                    for item in attrs:
                        if name in item.get("name", ""): return item.get("value", "")
                    attrs = model.get("detailData", {}).get("attributes", [])
                    for item in attrs:
                        if name in item.get("attributeName", ""): return item.get("value", "")
                except: pass
                return ""
            trade = model.get("tradeModel", {}) if isinstance(model, dict) else {}
            price_min = trade.get("minPrice", "") or ""
            if not price_min:
                # Fallback: first entry of the SKU price range, if present.
                try: price_min = model["sku"]["priceRange"][0][1]
                except: pass
            ranges = trade.get("disPriceRanges") or trade.get("currentPrices") or []
            range_text = " / ".join([f"{r.get('beginAmount')}起 ¥{r.get('price') or r.get('discountPrice')}" for r in ranges])
            # Assemble the fields shared by every variant row; each value has a
            # DOM/title fallback when the model lacks it.
            base_data = {
                "category": (model.get("offerDetail", {}).get("leafCategoryName", "") if isinstance(model, dict) else "") or self.driver.find_element(By.CSS_SELECTOR, "div[class*=breadcrumb] a:last-child").text.strip(),
                "brand": get_attr("品牌"),
                "name": (model.get("offerDetail", {}).get("subject", "") if isinstance(model, dict) else "") or self.driver.title.split('-')[0],
                "spec": get_attr("尺码") or get_attr("规格") or get_attr("型号"),
                "material": get_attr("材质") or get_attr("面料"),
                "price": price_min,
                "moq": trade.get("beginAmount", ""),
                "wholesale_price": range_text,
                "link": url,
                "supplier": (model.get("sellerModel", {}).get("companyName", "") if isinstance(model, dict) else ""),
            }
            # Pick the "main" SKU property (color/style-like), else the first one.
            sku_props = model.get("skuModel", {}).get("skuProps", []) or model.get("detailData", {}).get("skuProps", []) or []
            main_prop = next((p for p in sku_props if any(k in p.get("prop", "") for k in ["颜色", "分类", "款式", "花色"])), None)
            if not main_prop and sku_props: main_prop = sku_props[0]
            if main_prop and main_prop.get("value"):
                # One output row per named variant value.
                results = []
                for val in main_prop["value"]:
                    if val.get("name"):
                        row = base_data.copy()
                        row["color"] = val.get("name")
                        results.append(row)
                return results
            return [base_data]
        except: return None
  227. def clean_url(self, url):
  228. if not url: return ""
  229. id_match = re.search(r'offer/(\d+)\.html', url)
  230. if id_match: return f"https://detail.1688.com/offer/{id_match.group(1)}.html"
  231. return url
  232. def _extract_all_methods(self):
  233. results = []
  234. try:
  235. res = self.driver.execute_script("return JSON.stringify(window.data || window.__INITIAL_DATA__)")
  236. if res:
  237. data = json.loads(res)
  238. def find_list(obj):
  239. if isinstance(obj, list) and len(obj) > 0 and ('title' in obj[0] or 'offerId' in obj[0]): return obj
  240. if isinstance(obj, dict):
  241. for k in obj:
  242. f = find_list(obj[k])
  243. if f: return f
  244. return None
  245. for o in (find_list(data) or []):
  246. link = o.get('itemUrl', o.get('url', ''))
  247. if link: results.append({"name": str(o.get('title', '')), "link": link})
  248. except: pass
  249. if not results:
  250. for s in [".search-offer-item", "[class*='offer-card']", ".offer-item"]:
  251. for el in self.driver.find_elements(By.CSS_SELECTOR, s):
  252. try:
  253. link = el.find_element(By.TAG_NAME, "a").get_attribute("href")
  254. if link: results.append({"name": el.text.split('\n')[0][:50], "link": link})
  255. except: continue
  256. if results: break
  257. return results
  258. def quit(self):
  259. try: self.driver.quit()
  260. except: pass