| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191 |
- import json
- import time
- from copy import copy
- from pathlib import Path
- from openpyxl import Workbook, load_workbook
- from selenium import webdriver
- from selenium.webdriver.chrome.options import Options
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.support import expected_conditions as EC
# Remote-debugging endpoint; must match the port Chrome was started with.
DEBUG_ADDR = "127.0.0.1:9222"

# Timeout, in seconds, handed to WebDriverWait.
WAIT = 15

# Path of the Excel workbook that results are written to.
EXCEL_PATH = Path(r"C:\Users\Meng\PycharmProjects\PythonProject\【进价】产品信息空表.xlsx")

# Column headers, in sheet order (code, category, brand, product name,
# colour, size, material, unit price, MOQ, wholesale price, link, supplier).
COLUMNS = [
    "编码", "品类", "品牌", "商品名称",
    "颜色", "规格尺码", "材质",
    "单品进价(元)", "moq(起订量)", "批发进价(元)",
    "产品链接", "供应商信息",
]
def build_driver():
    """Create a Chrome WebDriver bound to the browser at ``DEBUG_ADDR``.

    Returns:
        The ``webdriver.Chrome`` instance built from the options below.
    """
    chrome_options = Options()
    # Point the driver at the Chrome remote-debugging endpoint.
    chrome_options.add_experimental_option("debuggerAddress", DEBUG_ADDR)
    launch_flags = (
        "--start-maximized",
        # Optional: added to reduce automation fingerprinting.
        "--disable-blink-features=AutomationControlled",
    )
    for flag in launch_flags:
        chrome_options.add_argument(flag)
    return webdriver.Chrome(options=chrome_options)
def human_wait(sec=1.2):
    """Block for ``sec`` seconds (1.2 by default) to pace page interactions."""
    time.sleep(sec)
def _as_dict(value):
    """Return ``value`` if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _as_list(value):
    """Return ``value`` if it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []


def _unit_price_text(low, high):
    """Format the unit price as ``low-high``, or ``low`` when there is no distinct range."""
    if not low:
        return ""
    if high and high != low:
        return f"{low}-{high}"
    return f"{low}"


def _price_tiers_text(trade):
    """Join the wholesale price tiers found in ``trade`` into one display string."""
    tiers = (
        _as_list(trade.get("disPriceRanges"))
        or _as_list(trade.get("currentPrices"))
        or _as_list(_as_dict(trade.get("offerPriceModel")).get("currentPrices"))
    )
    return " / ".join(
        f"{tier.get('beginAmount')}起 ¥{tier.get('price') or tier.get('discountPrice')}"
        for tier in tiers
        if isinstance(tier, dict)
    )


def scrape_item(driver, url):
    """Open ``url`` and collect one product's fields.

    Values are read from the page's embedded data model first; for most
    fields the rendered DOM is queried as a fallback when the model has
    no value.

    Args:
        driver: WebDriver used to load and query the page.
        url: Product detail page address; also stored in the result.

    Returns:
        dict keyed by the sheet column names (all except "编码"). Fields
        that could not be found hold an empty string.
    """
    driver.get(url)
    WebDriverWait(driver, WAIT).until(
        EC.presence_of_element_located((By.TAG_NAME, "body"))
    )
    human_wait()

    def safe_text(by, sel):
        """Return the stripped text of the first matching element, or ""."""
        try:
            return driver.find_element(by, sel).text.strip()
        except Exception:
            # Best-effort DOM fallback: a missing element must not abort the scrape.
            return ""

    def attr_cell_text(label):
        """Read the value cell next to ``label`` in the product attribute table."""
        xpath = (
            "//div[@id='productAttributes']"
            f"//th[span='{label}']"
            "/following-sibling::td[1]//span[@class='field-value']"
        )
        return safe_text(By.XPATH, xpath)

    # Per the original author's note, 1688 pages expose the full product
    # data at window.context.result.global.globalData.model.
    model = _as_dict(
        driver.execute_script(
            "return (window.context && window.context.result && "
            "window.context.result.global && window.context.result.global.globalData "
            "&& window.context.result.global.globalData.model) || null;"
        )
    )
    # A JSON null arrives as None, which dict.get(key, {}) would return
    # unchanged; coerce every sub-model so the lookups below cannot raise.
    offer = _as_dict(model.get("offerDetail"))
    trade = _as_dict(model.get("tradeModel"))
    seller = _as_dict(model.get("sellerModel"))

    def get_attr(name):
        """Return the value of the feature attribute called ``name``, or ""."""
        for item in _as_list(offer.get("featureAttributes")):
            if isinstance(item, dict) and item.get("name") == name:
                value = item.get("value", "")
                return "" if value is None else value
        return ""

    # Price and MOQ
    price_min = trade.get("minPrice", "") or ""
    price_max = trade.get("maxPrice", "") or ""
    begin_amount = trade.get("beginAmount", "")

    return {
        "品类": offer.get("leafCategoryName", "")
        or safe_text(By.CSS_SELECTOR, "div[class*=breadcrumb] a:last-child"),
        "品牌": get_attr("品牌"),
        "商品名称": offer.get("subject", "")
        or safe_text(By.CSS_SELECTOR, "h1.d-title")
        or safe_text(By.CSS_SELECTOR, "h1[class*=title]"),
        "颜色": get_attr("颜色") or attr_cell_text("颜色"),
        "规格尺码": get_attr("尺码") or attr_cell_text("尺码"),
        "材质": get_attr("材质") or attr_cell_text("材质"),
        "单品进价(元)": _unit_price_text(price_min, price_max),
        "moq(起订量)": begin_amount or attr_cell_text("起订量"),
        "批发进价(元)": _price_tiers_text(trade),
        "产品链接": url,
        "供应商信息": seller.get("companyName", "")
        or safe_text(By.CSS_SELECTOR, "a.company-name")
        or safe_text(By.CSS_SELECTOR, "div.company-name"),
    }
def _row_is_blank(ws, row, columns):
    """Return True when every cell of ``row`` within ``columns`` is None or ""."""
    return all(ws.cell(row=row, column=col).value in (None, "") for col in columns)


def _highest_numeric_code(ws, last_row):
    """Return the largest numeric value in column 1 of rows 1..last_row (0 if none)."""
    highest = 0
    for row in range(1, last_row + 1):
        val = ws.cell(row=row, column=1).value
        if isinstance(val, (int, float)) and val > highest:
            highest = int(val)
    return highest


def save_to_excel(path: Path, rows: list[dict]):
    """Write scraped rows into the workbook at ``path``, keeping its formatting.

    - Uses the workbook's active worksheet; a new workbook with a header
      row is created when ``path`` does not exist.
    - Cell styles are copied from a template row: row 3 when the sheet
      has at least three rows, otherwise row 2, otherwise row 1.
    - Each record goes into the next row below the template row whose
      data columns (everything except the code column) are all blank;
      rows that already hold data are skipped, never overwritten. Once
      the existing rows are exhausted, records are added at the end.
    - A code already present in column 1 of a reused blank row is kept.
      New codes continue from the largest numeric code in column 1.

    Args:
        path: Workbook location.
        rows: One dict per record, keyed by column name. A "编码" entry,
            when present, overrides the automatic code.

    If saving raises ``PermissionError``, the workbook is written to a
    timestamped sibling file instead and that path is printed.
    """
    if path.exists():
        wb = load_workbook(path)
        ws = wb.active
    else:
        wb = Workbook()
        ws = wb.active
        ws.append(COLUMNS)

    # Snapshot the extent of the pre-existing sheet once; rows past it are new.
    last_existing_row = ws.max_row

    # Template row (usually the styled first data row of the sheet).
    template_row_idx = 3 if last_existing_row >= 3 else 2 if last_existing_row >= 2 else 1
    template_row = ws[template_row_idx]

    data_cols = range(2, len(COLUMNS) + 1)  # skip the code column
    next_code = _highest_numeric_code(ws, last_existing_row) + 1

    # NOTE(review): the scan starts below the template row, so a blank
    # template row itself is never filled - confirm this is intended.
    target_row = template_row_idx + 1
    for row_data in rows:
        # Re-check every candidate row so data below a gap is not overwritten.
        while target_row <= last_existing_row and not _row_is_blank(ws, target_row, data_cols):
            target_row += 1

        existing_code = ws.cell(row=target_row, column=1).value
        if "编码" in row_data:
            code = row_data["编码"]
        elif existing_code not in (None, ""):
            # Keep the template's pre-filled numbering intact.
            code = existing_code
        else:
            code = next_code
            next_code += 1

        for col_idx, col_name in enumerate(COLUMNS, start=1):
            value = code if col_name == "编码" else row_data.get(col_name, "")
            # ws.cell creates the cell on demand, so no explicit append is needed.
            cell = ws.cell(row=target_row, column=col_idx, value=value)
            # Copy the template style
            if col_idx <= len(template_row):
                cell._style = copy(template_row[col_idx - 1]._style)
        target_row += 1

    try:
        wb.save(path)
    except PermissionError:
        alt = path.with_name(f"{path.stem}_out_{int(time.time())}{path.suffix}")
        wb.save(alt)
        print(f"原文件被占用,已写入副本:{alt}")
if __name__ == "__main__":
    ITEM_URLS = [
        "https://detail.1688.com/offer/860913286492.html?src=zhanwai&pid=301011_0000&ptid=017700000007986632e2076d03b97563&exp=enquiry%3AB%3BqueryMobilePhone%3AC%3Bxlyx%3AB&_force_exp_buckets_=11803%2C2024061703%2C2024011602&spm=a312h.2018_new_sem.dh_002.1.f2803d1evDvWwN&cosite=baidujj_pz&tracelog=p4p&_p_isad=1&clickid=73a657ca198445129a0f9657e5b848a5&sessionid=a22bd65f97342308cefe01ce93b2fb30&a=1128&e=Do0-iFADT1PETFOTY8XkCkiE39RAE-Osyk6HP6xWf0u9yqFIKWxE-tv1Y3207LPZ5B-yn5Il1MSwxqPfQ8JvdyS98dA7jdxprtnbuiV4OUqnLO30gnz1.Xd-cgEt.XmRfbc0snn-075TsNtnLsryNlOtFDjT98D.4kjEePgSJhSsTCWyLsXVy0.5ucS4u.tGix-9aJf-M.TJFqBjAJ84c-ZtUAhSFMPxrFV7CUJXRsAfWkEYWS5X9RD2lAIIXvm1vGzlY7ihi3tvyEKFOnqcIOZfewv0q.6g&sk=sem&style=1",
        'https://detail.1688.com/offer/711070382704.html?src=zhanwai&pid=301011_0000&ptid=017700000007986632e2076d03b97563&exp=enquiry%3AB%3BqueryMobilePhone%3AA%3Bxlyx%3AB&_force_exp_buckets_=11803%2C2024061701%2C2024011602&spm=a312h.2018_new_sem.dh_002.3.f2803d1eBPrwvm&cosite=baidujj_pz&tracelog=p4p&_p_isad=1&clickid=8f28c35e2dab4a02ac6ab7bde4d85d7e&sessionid=a22bd65f97342308cefe01ce93b2fb30&a=1143&e=zES-dp9sJJ3zrrk0AjWQ.80TNDrQDIxAfEp4OeWz-t73PxVXx2khSRkm4WW-Gc2mWvz9cqAP-EtjxMGkm81MROlxv-X4ECd0aflzSA.u7Sw3XqQVUjl1FuCMFIzXZNqVuhIhieuUBNzHeKI5kOSG2IZTlYUgrDcjviyzqR9QbTS1DK6Qg8fQSsEm2SWtGdfb.otTyBxd30qq6gPLLvqhYiW2FW.t4rGnK1eNs8ThmT6RNZF-ol9mriMaxAlPq2t2MvQtMFuvgdZWZh-v9vZHQsr2UDsUSUJ44V7XlLMFYIk_&sk=sem&style=1',
        # Replace with your own product links
    ]
    driver = build_driver()
    try:
        results = []
        for url in ITEM_URLS:
            try:
                info = scrape_item(driver, url)
            except Exception as exc:
                # Top-level boundary: report the failure and keep going so
                # one bad page does not discard the rows already scraped.
                print(f"抓取失败,已跳过:{url}({exc!r})")
            else:
                results.append(info)
                print(json.dumps(info, ensure_ascii=False, indent=2))
            human_wait(1.5)  # pace requests to avoid tripping anti-bot checks
        save_to_excel(EXCEL_PATH, results)
    finally:
        driver.quit()
|