# scraper.py
  1. # 针对 Python 3.12+ 移除 distutils 的兼容性补丁
  2. import sys
  3. try:
  4. import distutils
  5. except ImportError:
  6. from types import ModuleType
  7. d, v = ModuleType("distutils"), ModuleType("distutils.version")
  8. d.version = v
  9. sys.modules.update({"distutils": d, "distutils.version": v})
  10. class LooseVersion:
  11. def __init__(self, v): self.v = v
  12. def __lt__(self, o): return True
  13. def __str__(self): return str(self.v)
  14. v.LooseVersion = LooseVersion
  15. import time, random, re, os, subprocess, urllib.parse, json, traceback
  16. import undetected_chromedriver as uc
  17. from selenium.webdriver.common.by import By
  18. from selenium.webdriver.common.action_chains import ActionChains
  19. from selenium_stealth import stealth
  20. class Scraper1688:
  21. def __init__(self, headless=True, status_callback=None):
  22. self.headless = headless
  23. self.status_callback = status_callback # 用于回调 GUI 状态
  24. self.user_data_path = os.path.abspath(os.path.join(os.getcwd(), "1688_user_data"))
  25. self._cleanup()
  26. def create_options():
  27. options = uc.ChromeOptions()
  28. options.add_argument(f'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36')
  29. options.add_argument(f"--user-data-dir={self.user_data_path}")
  30. if headless: options.add_argument('--headless=new')
  31. options.add_argument('--disable-blink-features=AutomationControlled')
  32. options.add_argument("--window-size=1920,1080")
  33. return options
  34. try:
  35. # 关键修复:每次启动都使用 create_options() 产生的全新对象
  36. self.driver = uc.Chrome(options=create_options(), headless=headless, version_main=131)
  37. except:
  38. # 关键修复:这里也要用全新的 options 对象
  39. self.driver = uc.Chrome(options=create_options(), headless=headless)
  40. stealth(self.driver, languages=["zh-CN", "zh"], vendor="Google Inc.", platform="Win32", fix_hairline=True)
  41. def _cleanup(self):
  42. if os.name == 'nt': subprocess.call(['taskkill', '/F', '/IM', 'chrome.exe', '/T'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
  43. if os.path.exists(self.user_data_path):
  44. for root, _, files in os.walk(self.user_data_path):
  45. for f in files:
  46. if "lock" in f.lower() or f == "SingletonLock":
  47. try: os.remove(os.path.join(root, f))
  48. except: pass
  49. def clean_url(self, url):
  50. """极其鲁棒的 1688 URL 清洗逻辑"""
  51. if not url: return ""
  52. if url.startswith("//"): url = "https:" + url
  53. # 1. 尝试从路径中匹配 offer ID (标准 PC 链接)
  54. id_match = re.search(r'offer/(\d+)\.html', url)
  55. if id_match: return f"https://detail.1688.com/offer/{id_match.group(1)}.html"
  56. # 2. 尝试从查询参数中提取 offerId (移动端或广告链接)
  57. parsed = urllib.parse.urlparse(url)
  58. params = urllib.parse.parse_qs(parsed.query)
  59. oid = params.get('offerId') or params.get('id')
  60. if oid: return f"https://detail.1688.com/offer/{oid[0]}.html"
  61. # 3. 针对某些特殊加密链接,尝试寻找 data-aplus-report 或类似字符串中的 ID
  62. id_match_report = re.search(r'object_id@(\d+)', url)
  63. if id_match_report: return f"https://detail.1688.com/offer/{id_match_report.group(1)}.html"
  64. return url
  65. def check_for_captcha(self):
  66. """
  67. 核心监控:检测登录、滑块验证、访问受限等需要人工干预的状态
  68. """
  69. def is_blocked():
  70. try:
  71. url = self.driver.current_url.lower()
  72. src = self.driver.page_source.lower()
  73. title = self.driver.title.lower()
  74. # 1. 检测滑块验证码
  75. sliders = self.driver.find_elements(By.ID, "nc_1_n1z")
  76. is_slider = len(sliders) > 0 and sliders[0].is_displayed()
  77. # 2. 检测登录页面 (如果跳转到了登录页)
  78. is_login = "login.1688.com" in url or "passport.1688.com" in url
  79. # 3. 检测惩罚/验证提示页
  80. is_punish = "punish" in url or "哎哟喂" in src or "验证码" in title or "验证提示" in title
  81. # 4. 检测是否被登出 (如果页面包含登录按钮且当前是详情/搜索页)
  82. # 这部分可以根据实际情况增强,目前主要靠 URL 判定
  83. return is_slider or is_login or is_punish
  84. except:
  85. return False
  86. if is_blocked():
  87. msg = "请登录验证"
  88. print(f"\n[!] {msg}...")
  89. if self.status_callback:
  90. self.status_callback(True, msg)
  91. # 持续监控,直到上述所有拦截状态消失
  92. while is_blocked():
  93. time.sleep(2)
  94. if self.status_callback:
  95. self.status_callback(False, "验证通过")
  96. print("\n[OK] 监测到人工干预已完成,3秒后恢复自动抓取...")
  97. time.sleep(3)
  98. return True
  99. # def search_products_yield(self, keyword, total_count=200):
  100. def search_products_yield(self, keyword, total_count=200, existing_links=None):
  101. gbk_keyword = urllib.parse.quote(keyword, encoding='gbk')
  102. base_url = f"https://s.1688.com/selloffer/offer_search.htm?keywords={gbk_keyword}&n=y&netType=1%2C11%2C16"
  103. # 初始检查:确保在开始抓取前没被拦截(比如没登录)
  104. self.driver.get("https://www.1688.com")
  105. self.check_for_captcha()
  106. all_links = existing_links if existing_links is not None else set()
  107. page = 1
  108. consecutive_empty_pages = 0
  109. # 记录初始抓取的链接数,用于计算进度
  110. initial_count = len(all_links)
  111. while len(all_links) < total_count + initial_count and consecutive_empty_pages < 3:
  112. print(f"[*] 正在搜索列表页: 第 {page} 页...")
  113. target_url = f"{base_url}&beginPage={page}&page={page}"
  114. self.driver.get(target_url)
  115. # 关键:首屏强制等待渲染
  116. time.sleep(5)
  117. self.check_for_captcha()
  118. # 深度滚动确保加载
  119. for i in range(1, 4):
  120. self.driver.execute_script(f"window.scrollTo(0, document.body.scrollHeight * {i/3});")
  121. time.sleep(1)
  122. page_results = self._extract_all_methods()
  123. page_batch = []
  124. for it in page_results:
  125. clean_url = self.clean_url(it["link"])
  126. if clean_url and clean_url not in all_links:
  127. all_links.add(clean_url)
  128. # 核心改进:进入详情页抓取精准数据
  129. print(f" [>] 抓取详情: {clean_url}")
  130. detail_results = self.scrape_detail(clean_url)
  131. if detail_results:
  132. # detail_results 现在是一个列表 (包含多个颜色分类)
  133. page_batch.extend(detail_results)
  134. else:
  135. # 兜底
  136. it["link"] = clean_url
  137. page_batch.append({
  138. "category": "", "brand": "", "name": it["name"],
  139. "color": "", "spec": "", "material": "", "price": it["price"],
  140. "moq": "", "wholesale_price": "", "link": clean_url, "supplier": ""
  141. })
  142. # 每满 10 条 yield 一次
  143. if len(page_batch) >= 10:
  144. yield page_batch
  145. page_batch = []
  146. # 详情页抓取后的随机等待
  147. time.sleep(random.uniform(2, 4))
  148. if len(all_links) >= total_count + initial_count:
  149. break
  150. # 每页结束,将不足 10 条的余数 yield 出去
  151. if page_batch:
  152. yield page_batch
  153. page_batch = []
  154. page += 1
  155. if len(all_links) < total_count + initial_count:
  156. print(f"[*] 累计已处理新链接: {len(all_links) - initial_count} 条,准备翻下一页...")
  157. time.sleep(3)
  158. return list(all_links)
  159. def scrape_detail(self, url):
  160. """
  161. 根据 /refe/req.py 订正的详情页抓取逻辑
  162. 获取极其精准的商品属性和价格数据,并支持将“颜色分类”拆分为多行
  163. """
  164. try:
  165. self.driver.get(url)
  166. time.sleep(2)
  167. self.check_for_captcha()
  168. # 执行 JS 获取 1688 详情页背后的完整数据模型
  169. model = self.driver.execute_script(
  170. "return (window.context && window.context.result && "
  171. "window.context.result.global && window.context.result.global.globalData "
  172. "&& window.context.result.global.globalData.model) || "
  173. "window.__INITIAL_DATA__ || window.iDetailData || window.iDetailConfig || null;"
  174. )
  175. if not model:
  176. return None
  177. def get_attr(name):
  178. """从 featureAttributes 里取指定属性值"""
  179. try:
  180. # 现代版
  181. attrs = model.get("offerDetail", {}).get("featureAttributes", [])
  182. for item in attrs:
  183. if name in item.get("name", ""): return item.get("value", "")
  184. # 老版兼容
  185. attrs = model.get("detailData", {}).get("attributes", [])
  186. for item in attrs:
  187. if name in item.get("attributeName", ""): return item.get("value", "")
  188. except: pass
  189. return ""
  190. def safe_text(by, sel):
  191. try:
  192. return self.driver.find_element(by, sel).text.strip()
  193. except: return ""
  194. # 价格处理逻辑
  195. trade = model.get("tradeModel", {}) if isinstance(model, dict) else {}
  196. price_min = trade.get("minPrice", "") or ""
  197. price_max = trade.get("maxPrice", "") or ""
  198. # 老版价格补丁
  199. if not price_min:
  200. try: price_min = model["sku"]["priceRange"][0][1]
  201. except: pass
  202. begin_amount = trade.get("beginAmount", "")
  203. # 批发价区间
  204. ranges = trade.get("disPriceRanges") or trade.get("currentPrices") or \
  205. trade.get("offerPriceModel", {}).get("currentPrices", [])
  206. range_text = " / ".join(
  207. [f"{r.get('beginAmount')}起 ¥{r.get('price') or r.get('discountPrice')}" for r in ranges]
  208. ) if ranges else ""
  209. # 基础数据模板
  210. base_data = {
  211. "category": (model.get("offerDetail", {}).get("leafCategoryName", "") if isinstance(model, dict) else "")
  212. or safe_text(By.CSS_SELECTOR, "div[class*=breadcrumb] a:last-child"),
  213. "brand": get_attr("品牌"),
  214. "name": (model.get("offerDetail", {}).get("subject", "") if isinstance(model, dict) else "")
  215. or safe_text(By.CSS_SELECTOR, "h1.d-title")
  216. or safe_text(By.CSS_SELECTOR, "h1[class*=title]"),
  217. "color": "", # 待填充
  218. "spec": get_attr("尺码") or get_attr("规格") or get_attr("型号") or \
  219. safe_text(By.XPATH, "//div[@id='productAttributes']//th[span='尺码' or span='规格']/following-sibling::td[1]//span[@class='field-value']"),
  220. "material": get_attr("材质") or get_attr("面料") or \
  221. safe_text(By.XPATH, "//div[@id='productAttributes']//th[span='材质']/following-sibling::td[1]//span[@class='field-value']"),
  222. "price": f"{price_min}-{price_max}" if price_min and price_max and price_min != price_max else f"{price_min}" if price_min else "",
  223. "moq": begin_amount or safe_text(By.XPATH, "//div[@id='productAttributes']//th[span='起订量' or span='起批量']/following-sibling::td[1]//span[@class='field-value']"),
  224. "wholesale_price": range_text,
  225. "link": url,
  226. "supplier": (model.get("sellerModel", {}).get("companyName", "") if isinstance(model, dict) else "")
  227. or safe_text(By.CSS_SELECTOR, "a.company-name")
  228. or safe_text(By.CSS_SELECTOR, "div.company-name"),
  229. }
  230. # --- 核心逻辑:拆分规格/颜色分类 ---
  231. sku_props = []
  232. try:
  233. # 尝试多种路径获取 SKU 属性
  234. sku_props = model.get("skuModel", {}).get("skuProps", []) or \
  235. model.get("detailData", {}).get("skuProps", []) or \
  236. model.get("sku", {}).get("skuProps", [])
  237. except: pass
  238. # 智能寻找主维度:
  239. # 1. 优先找包含“颜色”、“分类”、“款式”、“花色”的维度
  240. # 2. 如果没有,则取第一个 SKU 维度(例如“净含量”、“规格”等)
  241. main_prop = None
  242. if sku_props:
  243. main_prop = next((p for p in sku_props if any(k in p.get("prop", "") for k in ["颜色", "分类", "款式", "花色"])), None)
  244. if not main_prop:
  245. main_prop = sku_props[0]
  246. if main_prop and main_prop.get("value"):
  247. variant_results = []
  248. for val in main_prop["value"]:
  249. # 只有当该分类确实有名字时才记录
  250. variant_name = val.get("name")
  251. if variant_name:
  252. row = base_data.copy()
  253. row["color"] = variant_name
  254. variant_results.append(row)
  255. return variant_results
  256. else:
  257. # 兜底:如果没有发现规格选择区,则获取单属性颜色
  258. base_data["color"] = get_attr("颜色") or get_attr("颜色分类") or ""
  259. return [base_data]
  260. except Exception as e:
  261. print(f"[!] 详情页抓取异常 ({url}): {e}")
  262. return None
  263. except Exception as e:
  264. print(f"[!] 详情页抓取异常 ({url}): {e}")
  265. return None
  266. def _extract_all_methods(self):
  267. """三位一体提取法:JSON + DOM + 深度搜索"""
  268. results = []
  269. # 1. JSON 提取 (window.data 或 window.__INITIAL_DATA__)
  270. try:
  271. res = self.driver.execute_script("return JSON.stringify(window.data || window.__INITIAL_DATA__)")
  272. if res:
  273. data = json.loads(res)
  274. def find_list(obj):
  275. if isinstance(obj, list) and len(obj) > 0:
  276. if 'title' in obj[0] or 'offerId' in obj[0]: return obj
  277. if isinstance(obj, dict):
  278. for k in obj:
  279. found = find_list(obj[k])
  280. if found: return found
  281. return None
  282. raw = find_list(data) or []
  283. for o in raw:
  284. title = str(o.get('title', o.get('name', ''))).replace('<em>','').replace('</em>','')
  285. link = o.get('itemUrl', o.get('url', ''))
  286. price = o.get('priceInfo', {}).get('price', o.get('price', '面议'))
  287. if link: results.append({"name": title, "link": link, "price": price})
  288. except: pass
  289. # 2. 增强版 DOM 扫描
  290. if not results:
  291. # 包含最新的选择器
  292. selectors = [".search-offer-item", "[class*='offer-card']", ".offer-item", ".major-offer"]
  293. for s in selectors:
  294. cards = self.driver.find_elements(By.CSS_SELECTOR, s)
  295. if len(cards) > 3:
  296. for el in cards:
  297. try:
  298. # 1. 链接提取:自身或子孙节点
  299. link = ""
  300. if el.tag_name == 'a':
  301. link = el.get_attribute("href")
  302. else:
  303. a_tags = el.find_elements(By.TAG_NAME, "a")
  304. for a in a_tags:
  305. h = a.get_attribute("href")
  306. if h and ("offer" in h or "item" in h or "ci_bb" in h):
  307. link = h; break
  308. # 2. ID 补丁
  309. if not link or "1688.com" not in link:
  310. oid = el.get_attribute("data-offer-id") or el.get_attribute("data-id")
  311. if oid: link = f"https://detail.1688.com/offer/{oid}.html"
  312. if link:
  313. # 3. 标题和价格提取
  314. title = el.text.split('\n')[0][:50]
  315. price = "面议"
  316. try:
  317. price_el = el.find_element(By.CSS_SELECTOR, ".text-main, [class*='price'], .amount")
  318. price = price_el.text.strip().replace("¥", "")
  319. except: pass
  320. results.append({"name": title, "link": link, "price": price})
  321. except: continue
  322. if results: break # 成功一次就不再尝试其他选择器
  323. # 3. 最后的保底:正则源码提取 (极其暴力)
  324. if not results:
  325. ids = re.findall(r'data-offer-id="(\d+)"', self.driver.page_source)
  326. for oid in set(ids):
  327. results.append({"name": f"1688商品-{oid}", "link": f"https://detail.1688.com/offer/{oid}.html", "price": "面议"})
  328. return results
  329. def quit(self):
  330. try: self.driver.quit()
  331. except: pass