req.py 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191
  1. import json
  2. import time
  3. from copy import copy
  4. from pathlib import Path
  5. from openpyxl import Workbook, load_workbook
  6. from selenium import webdriver
  7. from selenium.webdriver.chrome.options import Options
  8. from selenium.webdriver.common.by import By
  9. from selenium.webdriver.support.ui import WebDriverWait
  10. from selenium.webdriver.support import expected_conditions as EC
  # Host:port of the Chrome remote-debugging endpoint; it must be the same
  # port Chrome was started with.
  DEBUG_ADDR = "127.0.0.1:9222"

  # Timeout, in seconds, handed to WebDriverWait.
  WAIT = 15

  # Location of the Excel workbook that receives the scraped rows.
  EXCEL_PATH = Path(r"C:\Users\Meng\PycharmProjects\PythonProject\【进价】产品信息空表.xlsx")

  # Worksheet column headers, left to right. The first entry is the code
  # column; the remaining entries are the data columns.
  COLUMNS = [
      "编码", "品类", "品牌", "商品名称",
      "颜色", "规格尺码", "材质",
      "单品进价(元)", "moq(起订量)", "批发进价(元)",
      "产品链接", "供应商信息",
  ]
  def build_driver():
      """Create a Chrome WebDriver bound to the browser at DEBUG_ADDR.

      The options carry the ``debuggerAddress`` experimental option followed
      by two command-line switches.

      NOTE(review): presumably the switches only matter when chromedriver
      launches the browser itself; confirm they have any effect when
      attaching to an already running Chrome.

      Returns:
          The ``webdriver.Chrome`` instance built from those options.
      """
      chrome_options = Options()
      chrome_options.add_experimental_option("debuggerAddress", DEBUG_ADDR)
      # The second switch is optional: it is meant to hide automation
      # markers and can be dropped if not needed.
      for switch in (
          "--start-maximized",
          "--disable-blink-features=AutomationControlled",
      ):
          chrome_options.add_argument(switch)
      return webdriver.Chrome(options=chrome_options)
  def human_wait(sec=1.2):
      """Block the calling thread for *sec* seconds (1.2 by default).

      Returns:
          None.
      """
      pause = time.sleep
      pause(sec)
  def scrape_item(driver, url):
      """Open a product page and collect its fields into a dict.

      Values are read first from the page's JavaScript state object
      (``window.context.result.global.globalData.model``). Several fields
      fall back to text scraped from the DOM when the state object does not
      provide them.

      Missing, ``null`` or wrongly typed sections of the state object are
      treated as empty rather than raising, so a partially populated page
      still yields a row.

      Args:
          driver: Selenium WebDriver used to load and query the page.
          url: Address of the product page; also stored in the result.

      Returns:
          A dict keyed by every name in COLUMNS except the code column.
          Fields that cannot be determined are empty strings.

      Raises:
          Whatever ``driver.get``, the explicit wait on ``<body>`` or
          ``driver.execute_script`` raise; those are not caught here.
      """
      driver.get(url)
      WebDriverWait(driver, WAIT).until(
          EC.presence_of_element_located((By.TAG_NAME, "body"))
      )
      human_wait()

      def safe_text(by, sel):
          """Return the stripped text of the first match, or "" on failure."""
          try:
              return driver.find_element(by, sel).text.strip()
          except Exception:
              # Deliberately best-effort: a missing element only means this
              # fallback has nothing to offer.
              return ""

      def attr_cell_text(label):
          """Read the value shown next to *label* in the attributes table."""
          xpath = (
              "//div[@id='productAttributes']"
              f"//th[span='{label}']/following-sibling::td[1]"
              "//span[@class='field-value']"
          )
          return safe_text(By.XPATH, xpath)

      def as_dict(value):
          """Return *value* if it is a dict, otherwise an empty dict."""
          return value if isinstance(value, dict) else {}

      # The page state object holds the product data; any level of the path
      # may be absent, in which case the script returns null.
      model = as_dict(
          driver.execute_script(
              "return (window.context && window.context.result && "
              "window.context.result.global && window.context.result.global.globalData "
              "&& window.context.result.global.globalData.model) || null;"
          )
      )
      # `or`-style defaults via as_dict: a key that is present but null must
      # behave like a missing key instead of breaking the chained lookups.
      offer = as_dict(model.get("offerDetail"))
      trade = as_dict(model.get("tradeModel"))
      seller = as_dict(model.get("sellerModel"))

      def get_attr(name):
          """Return the value of the feature attribute called *name*, or ""."""
          attrs = offer.get("featureAttributes")
          if not isinstance(attrs, list):
              return ""
          for item in attrs:
              # Skip malformed entries instead of abandoning the whole list.
              if isinstance(item, dict) and item.get("name") == name:
                  return item.get("value") or ""
          return ""

      # Price and minimum order quantity.
      price_min = trade.get("minPrice") or ""
      price_max = trade.get("maxPrice") or ""
      begin_amount = trade.get("beginAmount") or ""

      if price_min and price_max and price_min != price_max:
          unit_price = f"{price_min}-{price_max}"
      elif price_min:
          unit_price = f"{price_min}"
      else:
          unit_price = ""

      # Tiered wholesale prices: take the first non-empty source.
      ranges = (
          trade.get("disPriceRanges")
          or trade.get("currentPrices")
          or as_dict(trade.get("offerPriceModel")).get("currentPrices")
          or []
      )
      if not isinstance(ranges, list):
          ranges = []
      range_text = " / ".join(
          f"{r.get('beginAmount')}起 ¥{r.get('price') or r.get('discountPrice')}"
          for r in ranges
          if isinstance(r, dict)
      )

      return {
          "品类": offer.get("leafCategoryName")
          or safe_text(By.CSS_SELECTOR, "div[class*=breadcrumb] a:last-child"),
          "品牌": get_attr("品牌"),
          "商品名称": offer.get("subject")
          or safe_text(By.CSS_SELECTOR, "h1.d-title")
          or safe_text(By.CSS_SELECTOR, "h1[class*=title]"),
          "颜色": get_attr("颜色") or attr_cell_text("颜色"),
          "规格尺码": get_attr("尺码") or attr_cell_text("尺码"),
          "材质": get_attr("材质") or attr_cell_text("材质"),
          "单品进价(元)": unit_price,
          "moq(起订量)": begin_amount or attr_cell_text("起订量"),
          "批发进价(元)": range_text,
          "产品链接": url,
          "供应商信息": seller.get("companyName")
          or safe_text(By.CSS_SELECTOR, "a.company-name")
          or safe_text(By.CSS_SELECTOR, "div.company-name"),
      }
  def save_to_excel(path: Path, rows: list[dict]):
      """Write scraped rows into the workbook at *path*, reusing its styling.

      The active worksheet of an existing workbook is used. If *path* does
      not exist, a new workbook is created with COLUMNS as its header row.

      Layout rules:
        - Template row: row 3 if the sheet has at least three rows, else
          row 2, else row 1. Its cell styles are copied onto every written
          row.
        - Target rows: each record goes into the next row after the template
          row whose data columns (all but the code column) are empty, or
          past the end of the sheet. Occupied rows are skipped for every
          record, not only the first, so existing data is never overwritten.
        - Code column: a "编码" value in the record wins. Otherwise a code
          already present in the target row is kept, and only an empty code
          cell receives the next number (largest numeric code in the first
          column, plus one).

      Args:
          path: Workbook location.
          rows: Records keyed by column name; missing data keys are written
              as empty strings.

      If saving raises PermissionError, the workbook is written to a
      timestamped sibling file instead and that file's path is printed.
      """
      if path.exists():
          wb = load_workbook(path)
          ws = wb.active
      else:
          wb = Workbook()
          ws = wb.active
          ws.append(COLUMNS)

      # Snapshot the sheet height before writing: rows past it are free by
      # definition and must not be probed, because ws.cell() creates cells.
      last_existing_row = ws.max_row

      # The template row is normally the first styled data row.
      template_row_idx = max(1, min(last_existing_row, 3))
      template_row = ws[template_row_idx]

      data_cols = range(2, len(COLUMNS) + 1)  # every column except the code

      def is_blank(row_idx):
          """Tell whether all data columns of an existing row are empty."""
          return all(
              ws.cell(row=row_idx, column=c).value in (None, "")
              for c in data_cols
          )

      def next_free_row(start):
          """Return the first row at or after *start* that can take data."""
          row_idx = start
          while row_idx <= last_existing_row and not is_blank(row_idx):
              row_idx += 1
          return row_idx

      # Starting value for generated codes: largest number in column 1.
      last_code = 0
      for r in range(1, last_existing_row + 1):
          val = ws.cell(row=r, column=1).value
          is_number = isinstance(val, (int, float)) and not isinstance(val, bool)
          if is_number and val > last_code:
              last_code = int(val)
      next_code = last_code + 1

      insert_row = next_free_row(template_row_idx + 1)
      for row_data in rows:
          for col_idx, col_name in enumerate(COLUMNS, start=1):
              cell = ws.cell(row=insert_row, column=col_idx)
              if col_name == "编码":
                  if "编码" in row_data:
                      cell.value = row_data["编码"]
                  elif cell.value in (None, ""):
                      cell.value = next_code
                      next_code += 1
                  # Otherwise the row already carries a code: keep it so the
                  # template's numbering is not disturbed.
              else:
                  cell.value = row_data.get(col_name, "")
              # Copy the template style where the template has that column.
              if col_idx <= len(template_row):
                  cell._style = copy(template_row[col_idx - 1]._style)
          insert_row = next_free_row(insert_row + 1)

      try:
          wb.save(path)
      except PermissionError:
          # Typically the file is open elsewhere; keep the data in a copy.
          alt = path.with_name(f"{path.stem}_out_{int(time.time())}{path.suffix}")
          wb.save(alt)
          print(f"原文件被占用,已写入副本:{alt}")
  if __name__ == "__main__":
      # Script entry point: scrape every URL below, print each record as JSON
      # and write all collected records to EXCEL_PATH.
      ITEM_URLS = [
          "https://detail.1688.com/offer/860913286492.html?src=zhanwai&pid=301011_0000&ptid=017700000007986632e2076d03b97563&exp=enquiry%3AB%3BqueryMobilePhone%3AC%3Bxlyx%3AB&_force_exp_buckets_=11803%2C2024061703%2C2024011602&spm=a312h.2018_new_sem.dh_002.1.f2803d1evDvWwN&cosite=baidujj_pz&tracelog=p4p&_p_isad=1&clickid=73a657ca198445129a0f9657e5b848a5&sessionid=a22bd65f97342308cefe01ce93b2fb30&a=1128&e=Do0-iFADT1PETFOTY8XkCkiE39RAE-Osyk6HP6xWf0u9yqFIKWxE-tv1Y3207LPZ5B-yn5Il1MSwxqPfQ8JvdyS98dA7jdxprtnbuiV4OUqnLO30gnz1.Xd-cgEt.XmRfbc0snn-075TsNtnLsryNlOtFDjT98D.4kjEePgSJhSsTCWyLsXVy0.5ucS4u.tGix-9aJf-M.TJFqBjAJ84c-ZtUAhSFMPxrFV7CUJXRsAfWkEYWS5X9RD2lAIIXvm1vGzlY7ihi3tvyEKFOnqcIOZfewv0q.6g&sk=sem&style=1",
          'https://detail.1688.com/offer/711070382704.html?src=zhanwai&pid=301011_0000&ptid=017700000007986632e2076d03b97563&exp=enquiry%3AB%3BqueryMobilePhone%3AA%3Bxlyx%3AB&_force_exp_buckets_=11803%2C2024061701%2C2024011602&spm=a312h.2018_new_sem.dh_002.3.f2803d1eBPrwvm&cosite=baidujj_pz&tracelog=p4p&_p_isad=1&clickid=8f28c35e2dab4a02ac6ab7bde4d85d7e&sessionid=a22bd65f97342308cefe01ce93b2fb30&a=1143&e=zES-dp9sJJ3zrrk0AjWQ.80TNDrQDIxAfEp4OeWz-t73PxVXx2khSRkm4WW-Gc2mWvz9cqAP-EtjxMGkm81MROlxv-X4ECd0aflzSA.u7Sw3XqQVUjl1FuCMFIzXZNqVuhIhieuUBNzHeKI5kOSG2IZTlYUgrDcjviyzqR9QbTS1DK6Qg8fQSsEm2SWtGdfb.otTyBxd30qq6gPLLvqhYiW2FW.t4rGnK1eNs8ThmT6RNZF-ol9mriMaxAlPq2t2MvQtMFuvgdZWZh-v9vZHQsr2UDsUSUJ44V7XlLMFYIk_&sk=sem&style=1',
          # Replace with your own product links.
      ]
      driver = build_driver()
      results = []
      try:
          for url in ITEM_URLS:
              try:
                  info = scrape_item(driver, url)
              except Exception as exc:
                  # One bad page must not discard the pages already scraped.
                  print(f"抓取失败,已跳过:{url}({exc!r})")
              else:
                  results.append(info)
                  print(json.dumps(info, ensure_ascii=False, indent=2))
              human_wait(1.5)  # pace the requests to avoid tripping rate limits
      finally:
          # Persist whatever was collected, even after an interruption, and
          # release the driver regardless of whether saving succeeds.
          try:
              if results:
                  save_to_excel(EXCEL_PATH, results)
          finally:
              driver.quit()