import json import time from copy import copy from pathlib import Path from openpyxl import Workbook, load_workbook from selenium import webdriver from selenium.webdriver.chrome.options import Options from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC DEBUG_ADDR = "127.0.0.1:9222" # 与启动 Chrome 时的端口一致 WAIT = 15 # 填入excel表格的路径 EXCEL_PATH = Path( r"C:\Users\Meng\PycharmProjects\PythonProject\【进价】产品信息空表.xlsx" ) COLUMNS = [ "编码", "品类", "品牌", "商品名称", "颜色", "规格尺码", "材质", "单品进价(元)", "moq(起订量)", "批发进价(元)", "产品链接", "供应商信息", ] def build_driver(): opts = Options() opts.add_experimental_option("debuggerAddress", DEBUG_ADDR) opts.add_argument("--start-maximized") # 若需要规避自动化特征可按需添加: opts.add_argument("--disable-blink-features=AutomationControlled") return webdriver.Chrome(options=opts) def human_wait(sec=1.2): time.sleep(sec) def scrape_item(driver, url): driver.get(url) WebDriverWait(driver, WAIT).until( EC.presence_of_element_located((By.TAG_NAME, "body")) ) human_wait() def safe_text(by, sel): try: return driver.find_element(by, sel).text.strip() except Exception: return "" # 1688 页面里 window.context.result.global.globalData.model 含完整商品数据 model = driver.execute_script( "return (window.context && window.context.result && " "window.context.result.global && window.context.result.global.globalData " "&& window.context.result.global.globalData.model) || null;" ) def get_attr(name): """从 featureAttributes 里取指定属性值""" try: attrs = model["offerDetail"]["featureAttributes"] for item in attrs: if item.get("name") == name: return item.get("value", "") except Exception: return "" return "" # 价格与 MOQ trade = model.get("tradeModel", {}) if model else {} price_min = trade.get("minPrice", "") or "" price_max = trade.get("maxPrice", "") or "" begin_amount = trade.get("beginAmount", "") # 批发价区间拼接 ranges = trade.get("disPriceRanges") or trade.get("currentPrices") or trade.get("offerPriceModel", {}).get("currentPrices", []) range_text = " / ".join( [f"{r.get('beginAmount')}起 ¥{r.get('price') or r.get('discountPrice')}" for r in ranges] ) if ranges else "" data = { "品类": (model.get("offerDetail", {}).get("leafCategoryName", "") if model else "") or safe_text(By.CSS_SELECTOR, "div[class*=breadcrumb] a:last-child"), "品牌": get_attr("品牌"), "商品名称": (model.get("offerDetail", {}).get("subject", "") if model else "") or safe_text(By.CSS_SELECTOR, "h1.d-title") or safe_text(By.CSS_SELECTOR, "h1[class*=title]"), "颜色": get_attr("颜色") or safe_text(By.XPATH, "//div[@id='productAttributes']//th[span='颜色']/following-sibling::td[1]//span[@class='field-value']"), "规格尺码": get_attr("尺码") or safe_text(By.XPATH, "//div[@id='productAttributes']//th[span='尺码']/following-sibling::td[1]//span[@class='field-value']"), "材质": get_attr("材质") or safe_text(By.XPATH, "//div[@id='productAttributes']//th[span='材质']/following-sibling::td[1]//span[@class='field-value']"), "单品进价(元)": f"{price_min}-{price_max}" if price_min and price_max and price_min != price_max else f"{price_min}" if price_min else "", "moq(起订量)": begin_amount or safe_text(By.XPATH, "//div[@id='productAttributes']//th[span='起订量']/following-sibling::td[1]//span[@class='field-value']"), "批发进价(元)": range_text, "产品链接": url, "供应商信息": (model.get("sellerModel", {}).get("companyName", "") if model else "") or safe_text(By.CSS_SELECTOR, "a.company-name") or safe_text(By.CSS_SELECTOR, "div.company-name"), } return data def save_to_excel(path: Path, rows: list[dict]): """ 将抓取结果追加写入已有格式的表格。 - 默认使用首个工作表,并基于模板行复制样式。 - 模板行:若存在第3行则用第3行样式,否则用第2行。 """ if path.exists(): wb = load_workbook(path) ws = wb.active else: wb = Workbook() ws = wb.active ws.append(COLUMNS) # 选择模板行(通常是设计好的第一行数据样式) template_row_idx = 3 if ws.max_row >= 3 else 2 if ws.max_row >= 2 else 1 template_row = ws[template_row_idx] # 找到首个“数据区”空行(除编码列外其余列为空),否则追加到末尾 data_cols = range(2, len(COLUMNS) + 1) # 跳过编码列 first_empty_row = None for r in range(template_row_idx + 1, ws.max_row + 1): if all((ws.cell(row=r, column=c).value in (None, "")) for c in data_cols): first_empty_row = r break insert_row = first_empty_row or (ws.max_row + 1) # 计算编码起始值(取首列已有最大数字) last_code = 0 for r in range(1, ws.max_row + 1): try: val = ws.cell(row=r, column=1).value if isinstance(val, (int, float)) and val > last_code: last_code = int(val) except Exception: continue next_code = last_code + 1 for row_data in rows: # 若目标行不存在,扩展表行数;若已存在空行,直接写入避免上移模板序号 if insert_row > ws.max_row: ws.append([None] * len(COLUMNS)) for col_idx, col_name in enumerate(COLUMNS, start=1): if col_name == "编码": value = row_data.get("编码", next_code) else: value = row_data.get(col_name, "") cell = ws.cell(row=insert_row, column=col_idx, value=value) # 复制模板样式 if col_idx <= len(template_row): tmpl = template_row[col_idx - 1] cell._style = copy(tmpl._style) next_code += 1 insert_row += 1 try: wb.save(path) except PermissionError: alt = path.with_name(f"{path.stem}_out_{int(time.time())}{path.suffix}") wb.save(alt) print(f"原文件被占用,已写入副本:{alt}") if __name__ == "__main__": ITEM_URLS = [ "https://detail.1688.com/offer/860913286492.html?src=zhanwai&pid=301011_0000&ptid=017700000007986632e2076d03b97563&exp=enquiry%3AB%3BqueryMobilePhone%3AC%3Bxlyx%3AB&_force_exp_buckets_=11803%2C2024061703%2C2024011602&spm=a312h.2018_new_sem.dh_002.1.f2803d1evDvWwN&cosite=baidujj_pz&tracelog=p4p&_p_isad=1&clickid=73a657ca198445129a0f9657e5b848a5&sessionid=a22bd65f97342308cefe01ce93b2fb30&a=1128&e=Do0-iFADT1PETFOTY8XkCkiE39RAE-Osyk6HP6xWf0u9yqFIKWxE-tv1Y3207LPZ5B-yn5Il1MSwxqPfQ8JvdyS98dA7jdxprtnbuiV4OUqnLO30gnz1.Xd-cgEt.XmRfbc0snn-075TsNtnLsryNlOtFDjT98D.4kjEePgSJhSsTCWyLsXVy0.5ucS4u.tGix-9aJf-M.TJFqBjAJ84c-ZtUAhSFMPxrFV7CUJXRsAfWkEYWS5X9RD2lAIIXvm1vGzlY7ihi3tvyEKFOnqcIOZfewv0q.6g&sk=sem&style=1", 'https://detail.1688.com/offer/711070382704.html?src=zhanwai&pid=301011_0000&ptid=017700000007986632e2076d03b97563&exp=enquiry%3AB%3BqueryMobilePhone%3AA%3Bxlyx%3AB&_force_exp_buckets_=11803%2C2024061701%2C2024011602&spm=a312h.2018_new_sem.dh_002.3.f2803d1eBPrwvm&cosite=baidujj_pz&tracelog=p4p&_p_isad=1&clickid=8f28c35e2dab4a02ac6ab7bde4d85d7e&sessionid=a22bd65f97342308cefe01ce93b2fb30&a=1143&e=zES-dp9sJJ3zrrk0AjWQ.80TNDrQDIxAfEp4OeWz-t73PxVXx2khSRkm4WW-Gc2mWvz9cqAP-EtjxMGkm81MROlxv-X4ECd0aflzSA.u7Sw3XqQVUjl1FuCMFIzXZNqVuhIhieuUBNzHeKI5kOSG2IZTlYUgrDcjviyzqR9QbTS1DK6Qg8fQSsEm2SWtGdfb.otTyBxd30qq6gPLLvqhYiW2FW.t4rGnK1eNs8ThmT6RNZF-ol9mriMaxAlPq2t2MvQtMFuvgdZWZh-v9vZHQsr2UDsUSUJ44V7XlLMFYIk_&sk=sem&style=1', # 替换为你的商品链接 ] driver = build_driver() try: results = [] for url in ITEM_URLS: info = scrape_item(driver, url) results.append(info) print(json.dumps(info, ensure_ascii=False, indent=2)) human_wait(1.5) # 控制节奏,避免频繁触发风控 save_to_excel(EXCEL_PATH, results) finally: driver.quit()