# scraper.py
  1. # 针对 Python 3.12+ 移除 distutils 的兼容性补丁
  2. import sys
  3. try:
  4. import distutils
  5. except ImportError:
  6. from types import ModuleType
  7. d, v = ModuleType("distutils"), ModuleType("distutils.version")
  8. d.version = v
  9. sys.modules.update({"distutils": d, "distutils.version": v})
  10. class LooseVersion:
  11. def __init__(self, v): self.v = v
  12. def __lt__(self, o): return True
  13. def __str__(self): return str(self.v)
  14. v.LooseVersion = LooseVersion
  15. import time, random, re, os, subprocess, urllib.parse, json, traceback, socket
  16. from selenium import webdriver
  17. from selenium.webdriver.edge.options import Options as EdgeOptions
  18. from selenium.webdriver.edge.service import Service as EdgeService
  19. from selenium.webdriver.chrome.options import Options as ChromeOptions
  20. import undetected_chromedriver as uc
  21. from selenium.webdriver.common.by import By
  22. from selenium.webdriver.common.action_chains import ActionChains
  23. from selenium_stealth import stealth
  24. class Scraper1688:
  25. def __init__(self, headless=True, status_callback=None):
  26. self.headless = headless
  27. self.status_callback = status_callback
  28. self.user_data_path = os.path.abspath(os.path.join(os.getcwd(), "1688_user_data"))
  29. self.driver = None
  30. edge_path = self._find_edge()
  31. if edge_path:
  32. print(f"[*] 检测到 Edge: {edge_path},正在全自动启动并接管...")
  33. self._cleanup_processes()
  34. edge_user_data = os.path.join(os.getcwd(), "1688_edge_profile")
  35. cmd = [edge_path, "--remote-debugging-port=9222", f"--user-data-dir={edge_user_data}", "--no-first-run", "--no-default-browser-check"]
  36. if headless: cmd.append("--headless")
  37. try:
  38. subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
  39. time.sleep(3)
  40. opts = EdgeOptions()
  41. opts.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
  42. try:
  43. self.driver = webdriver.Edge(options=opts)
  44. print("[+] Edge 浏览器已成功接管!")
  45. except:
  46. from webdriver_manager.microsoft import EdgeChromiumDriverManager
  47. service = EdgeService(EdgeChromiumDriverManager().install())
  48. self.driver = webdriver.Edge(service=service, options=opts)
  49. print("[+] Edge 浏览器已通过驱动管理接管!")
  50. except Exception as e:
  51. print(f"[*] Edge 启动失败: {e}")
  52. if not self.driver:
  53. print("[*] 正在启动 Chrome (undetected-chromedriver) 模式...")
  54. self._init_chrome(headless)
  55. if self.driver:
  56. if "edge" not in str(type(self.driver)).lower():
  57. stealth(self.driver, languages=["zh-CN", "zh"], vendor="Google Inc.", platform="Win32", fix_hairline=True)
  58. else:
  59. self.driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
  60. "source": "Object.defineProperty(navigator, 'webdriver', { get: () => undefined });"
  61. })
  62. def _find_edge(self):
  63. import winreg
  64. reg_paths = [(winreg.HKEY_LOCAL_MACHINE, r"SOFTWARE\Microsoft\Windows\CurrentVersion\App Paths\msedge.exe"), (winreg.HKEY_CURRENT_USER, r"SOFTWARE\Microsoft\Windows\CurrentVersion\App Paths\msedge.exe")]
  65. for hkey, subkey in reg_paths:
  66. try:
  67. with winreg.OpenKey(hkey, subkey) as key:
  68. path, _ = winreg.QueryValueEx(key, "")
  69. if os.path.exists(path): return path
  70. except: continue
  71. return None
  72. def _cleanup_processes(self):
  73. if os.name == 'nt':
  74. for proc in ['msedge.exe', 'msedgedriver.exe', 'chromedriver.exe']:
  75. subprocess.call(['taskkill', '/F', '/IM', proc, '/T'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
  76. def _init_chrome(self, headless):
  77. def create_options():
  78. opts = uc.ChromeOptions()
  79. opts.add_argument(f"--user-data-dir={self.user_data_path}")
  80. if headless: opts.add_argument('--headless=new')
  81. opts.add_argument('--disable-blink-features=AutomationControlled')
  82. return opts
  83. try: self.driver = uc.Chrome(options=create_options(), headless=headless)
  84. except: self.driver = uc.Chrome(options=create_options(), headless=headless)
  85. def clean_url(self, url):
  86. if not url: return ""
  87. if url.startswith("//"): url = "https:" + url
  88. id_match = re.search(r'offer/(\d+)\.html', url)
  89. if id_match: return f"https://detail.1688.com/offer/{id_match.group(1)}.html"
  90. parsed = urllib.parse.urlparse(url)
  91. params = urllib.parse.parse_qs(parsed.query)
  92. oid = params.get('offerId') or params.get('id')
  93. if oid: return f"https://detail.1688.com/offer/{oid[0]}.html"
  94. return url
  95. def check_for_captcha(self):
  96. def is_blocked():
  97. try:
  98. url, src, title = self.driver.current_url.lower(), self.driver.page_source.lower(), self.driver.title.lower()
  99. sliders = self.driver.find_elements(By.ID, "nc_1_n1z")
  100. is_slider = len(sliders) > 0 and sliders[0].is_displayed()
  101. is_login = "login.1688.com" in url or "passport.1688.com" in url
  102. is_punish = "punish" in url or "哎哟喂" in src or "验证码" in title
  103. return is_slider or is_login or is_punish
  104. except: return False
  105. if is_blocked():
  106. msg = "请登录验证"
  107. if self.status_callback: self.status_callback(True, msg)
  108. while is_blocked(): time.sleep(2)
  109. if self.status_callback: self.status_callback(False, "验证通过")
  110. time.sleep(3)
  111. return True
  112. def search_products_yield(self, keyword, total_count=200, existing_links=None):
  113. gbk_keyword = urllib.parse.quote(keyword, encoding='gbk')
  114. base_url = f"https://s.1688.com/selloffer/offer_search.htm?keywords={gbk_keyword}&n=y&netType=1%2C11%2C16"
  115. self.driver.get("https://www.1688.com")
  116. self.check_for_captcha()
  117. all_links = existing_links if existing_links is not None else set()
  118. page, initial_count = 1, len(all_links)
  119. while len(all_links) < total_count + initial_count:
  120. print(f"[*] 正在处理列表页: 第 {page} 页...")
  121. self.driver.get(f"{base_url}&beginPage={page}&page={page}")
  122. self.check_for_captcha()
  123. # --- 优化:模拟人类不均匀滚动,降低滑块频率 ---
  124. scroll_steps = random.randint(5, 10)
  125. for i in range(1, scroll_steps + 1):
  126. self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/scroll_steps});")
  127. time.sleep(random.uniform(0.5, 2.0))
  128. page_results = self._extract_all_methods()
  129. if not page_results:
  130. print(f"[!] 第 {page} 页无结果,尝试刷新...")
  131. self.driver.refresh()
  132. time.sleep(5)
  133. page_results = self._extract_all_methods()
  134. page_batch = []
  135. for it in page_results:
  136. clean_url = self.clean_url(it["link"])
  137. if clean_url and clean_url not in all_links:
  138. all_links.add(clean_url)
  139. # --- 优化:引入强制休息机制 ---
  140. current_new_count = len(all_links) - initial_count
  141. if current_new_count > 0 and current_new_count % 15 == 0:
  142. rest_time = random.randint(15, 30)
  143. print(f"[*] 为了账号安全,强制休息 {rest_time} 秒...")
  144. time.sleep(rest_time)
  145. print(f" [>] 抓取详情: {clean_url}")
  146. # 进入详情页前的微睡眠
  147. time.sleep(random.uniform(1.5, 3.5))
  148. detail_results = self.scrape_detail(clean_url)
  149. if detail_results: page_batch.extend(detail_results)
  150. else: page_batch.append({"link": clean_url, "name": it["name"]})
  151. if len(page_batch) >= 10:
  152. yield page_batch
  153. page_batch = []
  154. # --- 优化:详情页之间的大幅随机等待 ---
  155. time.sleep(random.uniform(6, 12))
  156. if len(all_links) >= total_count + initial_count: break
  157. if page_batch: yield page_batch
  158. page += 1
  159. if page > 100: break
  160. return list(all_links)
  161. def scrape_detail(self, url):
  162. """ 精准抓取详情页 """
  163. try:
  164. self.driver.get(url)
  165. time.sleep(random.uniform(2.5, 4.5)) # 详情页加载等待
  166. self.check_for_captcha()
  167. model = self.driver.execute_script(
  168. "return (window.context && window.context.result && window.context.result.global && window.context.result.global.globalData && window.context.result.global.globalData.model) || window.__INITIAL_DATA__ || window.iDetailData || window.iDetailConfig || null;"
  169. )
  170. if not model: return None
  171. def get_attr(name):
  172. try:
  173. attrs = model.get("offerDetail", {}).get("featureAttributes", [])
  174. for item in attrs:
  175. if name in item.get("name", ""): return item.get("value", "")
  176. attrs = model.get("detailData", {}).get("attributes", [])
  177. for item in attrs:
  178. if name in item.get("attributeName", ""): return item.get("value", "")
  179. except: pass
  180. return ""
  181. trade = model.get("tradeModel", {}) if isinstance(model, dict) else {}
  182. price_min = trade.get("minPrice", "") or ""
  183. if not price_min:
  184. try: price_min = model["sku"]["priceRange"][0][1]
  185. except: pass
  186. ranges = trade.get("disPriceRanges") or trade.get("currentPrices") or []
  187. range_text = " / ".join([f"{r.get('beginAmount')}起 ¥{r.get('price') or r.get('discountPrice')}" for r in ranges])
  188. base_data = {
  189. "category": (model.get("offerDetail", {}).get("leafCategoryName", "") if isinstance(model, dict) else "") or self.driver.find_element(By.CSS_SELECTOR, "div[class*=breadcrumb] a:last-child").text.strip(),
  190. "brand": get_attr("品牌"),
  191. "name": (model.get("offerDetail", {}).get("subject", "") if isinstance(model, dict) else "") or self.driver.title.split('-')[0],
  192. "spec": get_attr("尺码") or get_attr("规格") or get_attr("型号"),
  193. "material": get_attr("材质") or get_attr("面料"),
  194. "price": price_min,
  195. "moq": trade.get("beginAmount", ""),
  196. "wholesale_price": range_text,
  197. "link": url,
  198. "supplier": (model.get("sellerModel", {}).get("companyName", "") if isinstance(model, dict) else ""),
  199. }
  200. sku_props = model.get("skuModel", {}).get("skuProps", []) or model.get("detailData", {}).get("skuProps", []) or []
  201. main_prop = next((p for p in sku_props if any(k in p.get("prop", "") for k in ["颜色", "分类", "款式", "花色"])), None)
  202. if not main_prop and sku_props: main_prop = sku_props[0]
  203. if main_prop and main_prop.get("value"):
  204. results = []
  205. for val in main_prop["value"]:
  206. if val.get("name"):
  207. row = base_data.copy()
  208. row["color"] = val.get("name")
  209. results.append(row)
  210. return results
  211. return [base_data]
  212. except: return None
  213. def _extract_all_methods(self):
  214. """ 列表页提取 """
  215. results = []
  216. try:
  217. res = self.driver.execute_script("return JSON.stringify(window.data || window.__INITIAL_DATA__)")
  218. if res:
  219. data = json.loads(res)
  220. def find_list(obj):
  221. if isinstance(obj, list) and len(obj) > 0 and ('title' in obj[0] or 'offerId' in obj[0]): return obj
  222. if isinstance(obj, dict):
  223. for k in obj:
  224. f = find_list(obj[k])
  225. if f: return f
  226. return None
  227. for o in (find_list(data) or []):
  228. link = o.get('itemUrl', o.get('url', ''))
  229. if link: results.append({"name": str(o.get('title', '')), "link": link})
  230. except: pass
  231. if not results:
  232. for s in [".search-offer-item", "[class*='offer-card']", ".offer-item"]:
  233. elements = self.driver.find_elements(By.CSS_SELECTOR, s)
  234. if len(elements) > 2:
  235. for el in elements:
  236. try:
  237. a = el.find_element(By.TAG_NAME, "a")
  238. link = a.get_attribute("href")
  239. if link: results.append({"name": el.text.split('\n')[0][:50], "link": link})
  240. except: continue
  241. if results: break
  242. if not results:
  243. ids = re.findall(r'data-offer-id="(\d+)"', self.driver.page_source)
  244. for oid in set(ids):
  245. results.append({"name": f"1688商品-{oid}", "link": f"https://detail.1688.com/offer/{oid}.html"})
  246. return results
  247. def quit(self):
  248. try: self.driver.quit()
  249. except: pass