|
|
@@ -0,0 +1,191 @@
|
|
|
+import json
|
|
|
+import time
|
|
|
+from copy import copy
|
|
|
+from pathlib import Path
|
|
|
+
|
|
|
+from openpyxl import Workbook, load_workbook
|
|
|
+from selenium import webdriver
|
|
|
+from selenium.webdriver.chrome.options import Options
|
|
|
+from selenium.webdriver.common.by import By
|
|
|
+from selenium.webdriver.support.ui import WebDriverWait
|
|
|
+from selenium.webdriver.support import expected_conditions as EC
|
|
|
+
|
|
|
+DEBUG_ADDR = "127.0.0.1:9222" # 与启动 Chrome 时的端口一致
|
|
|
+WAIT = 15
|
|
|
+
|
|
|
+# 填入excel表格的路径
|
|
|
+EXCEL_PATH = Path(
|
|
|
+ r"C:\Users\Meng\PycharmProjects\PythonProject\【进价】产品信息空表.xlsx"
|
|
|
+)
|
|
|
+
|
|
|
+
|
|
|
+COLUMNS = [
|
|
|
+ "编码",
|
|
|
+ "品类",
|
|
|
+ "品牌",
|
|
|
+ "商品名称",
|
|
|
+ "颜色",
|
|
|
+ "规格尺码",
|
|
|
+ "材质",
|
|
|
+ "单品进价(元)",
|
|
|
+ "moq(起订量)",
|
|
|
+ "批发进价(元)",
|
|
|
+ "产品链接",
|
|
|
+ "供应商信息",
|
|
|
+]
|
|
|
+
|
|
|
+def build_driver():
|
|
|
+ opts = Options()
|
|
|
+ opts.add_experimental_option("debuggerAddress", DEBUG_ADDR)
|
|
|
+ opts.add_argument("--start-maximized")
|
|
|
+ # 若需要规避自动化特征可按需添加:
|
|
|
+ opts.add_argument("--disable-blink-features=AutomationControlled")
|
|
|
+ return webdriver.Chrome(options=opts)
|
|
|
+
|
|
|
+def human_wait(sec=1.2):
|
|
|
+ time.sleep(sec)
|
|
|
+
|
|
|
+def scrape_item(driver, url):
|
|
|
+ driver.get(url)
|
|
|
+ WebDriverWait(driver, WAIT).until(
|
|
|
+ EC.presence_of_element_located((By.TAG_NAME, "body"))
|
|
|
+ )
|
|
|
+ human_wait()
|
|
|
+
|
|
|
+ def safe_text(by, sel):
|
|
|
+ try:
|
|
|
+ return driver.find_element(by, sel).text.strip()
|
|
|
+ except Exception:
|
|
|
+ return ""
|
|
|
+
|
|
|
+ # 1688 页面里 window.context.result.global.globalData.model 含完整商品数据
|
|
|
+ model = driver.execute_script(
|
|
|
+ "return (window.context && window.context.result && "
|
|
|
+ "window.context.result.global && window.context.result.global.globalData "
|
|
|
+ "&& window.context.result.global.globalData.model) || null;"
|
|
|
+ )
|
|
|
+
|
|
|
+ def get_attr(name):
|
|
|
+ """从 featureAttributes 里取指定属性值"""
|
|
|
+ try:
|
|
|
+ attrs = model["offerDetail"]["featureAttributes"]
|
|
|
+ for item in attrs:
|
|
|
+ if item.get("name") == name:
|
|
|
+ return item.get("value", "")
|
|
|
+ except Exception:
|
|
|
+ return ""
|
|
|
+ return ""
|
|
|
+
|
|
|
+ # 价格与 MOQ
|
|
|
+ trade = model.get("tradeModel", {}) if model else {}
|
|
|
+ price_min = trade.get("minPrice", "") or ""
|
|
|
+ price_max = trade.get("maxPrice", "") or ""
|
|
|
+ begin_amount = trade.get("beginAmount", "")
|
|
|
+ # 批发价区间拼接
|
|
|
+ ranges = trade.get("disPriceRanges") or trade.get("currentPrices") or trade.get("offerPriceModel", {}).get("currentPrices", [])
|
|
|
+ range_text = " / ".join(
|
|
|
+ [f"{r.get('beginAmount')}起 ¥{r.get('price') or r.get('discountPrice')}" for r in ranges]
|
|
|
+ ) if ranges else ""
|
|
|
+
|
|
|
+ data = {
|
|
|
+ "品类": (model.get("offerDetail", {}).get("leafCategoryName", "") if model else "")
|
|
|
+ or safe_text(By.CSS_SELECTOR, "div[class*=breadcrumb] a:last-child"),
|
|
|
+ "品牌": get_attr("品牌"),
|
|
|
+ "商品名称": (model.get("offerDetail", {}).get("subject", "") if model else "")
|
|
|
+ or safe_text(By.CSS_SELECTOR, "h1.d-title")
|
|
|
+ or safe_text(By.CSS_SELECTOR, "h1[class*=title]"),
|
|
|
+ "颜色": get_attr("颜色") or safe_text(By.XPATH, "//div[@id='productAttributes']//th[span='颜色']/following-sibling::td[1]//span[@class='field-value']"),
|
|
|
+ "规格尺码": get_attr("尺码") or safe_text(By.XPATH, "//div[@id='productAttributes']//th[span='尺码']/following-sibling::td[1]//span[@class='field-value']"),
|
|
|
+ "材质": get_attr("材质") or safe_text(By.XPATH, "//div[@id='productAttributes']//th[span='材质']/following-sibling::td[1]//span[@class='field-value']"),
|
|
|
+ "单品进价(元)": f"{price_min}-{price_max}" if price_min and price_max and price_min != price_max else f"{price_min}" if price_min else "",
|
|
|
+ "moq(起订量)": begin_amount or safe_text(By.XPATH, "//div[@id='productAttributes']//th[span='起订量']/following-sibling::td[1]//span[@class='field-value']"),
|
|
|
+ "批发进价(元)": range_text,
|
|
|
+ "产品链接": url,
|
|
|
+ "供应商信息": (model.get("sellerModel", {}).get("companyName", "") if model else "")
|
|
|
+ or safe_text(By.CSS_SELECTOR, "a.company-name")
|
|
|
+ or safe_text(By.CSS_SELECTOR, "div.company-name"),
|
|
|
+ }
|
|
|
+ return data
|
|
|
+
|
|
|
+
|
|
|
+def save_to_excel(path: Path, rows: list[dict]):
|
|
|
+ """
|
|
|
+ 将抓取结果追加写入已有格式的表格。
|
|
|
+ - 默认使用首个工作表,并基于模板行复制样式。
|
|
|
+ - 模板行:若存在第3行则用第3行样式,否则用第2行。
|
|
|
+ """
|
|
|
+ if path.exists():
|
|
|
+ wb = load_workbook(path)
|
|
|
+ ws = wb.active
|
|
|
+ else:
|
|
|
+ wb = Workbook()
|
|
|
+ ws = wb.active
|
|
|
+ ws.append(COLUMNS)
|
|
|
+
|
|
|
+ # 选择模板行(通常是设计好的第一行数据样式)
|
|
|
+ template_row_idx = 3 if ws.max_row >= 3 else 2 if ws.max_row >= 2 else 1
|
|
|
+ template_row = ws[template_row_idx]
|
|
|
+
|
|
|
+ # 找到首个“数据区”空行(除编码列外其余列为空),否则追加到末尾
|
|
|
+ data_cols = range(2, len(COLUMNS) + 1) # 跳过编码列
|
|
|
+ first_empty_row = None
|
|
|
+ for r in range(template_row_idx + 1, ws.max_row + 1):
|
|
|
+ if all((ws.cell(row=r, column=c).value in (None, "")) for c in data_cols):
|
|
|
+ first_empty_row = r
|
|
|
+ break
|
|
|
+ insert_row = first_empty_row or (ws.max_row + 1)
|
|
|
+
|
|
|
+ # 计算编码起始值(取首列已有最大数字)
|
|
|
+ last_code = 0
|
|
|
+ for r in range(1, ws.max_row + 1):
|
|
|
+ try:
|
|
|
+ val = ws.cell(row=r, column=1).value
|
|
|
+ if isinstance(val, (int, float)) and val > last_code:
|
|
|
+ last_code = int(val)
|
|
|
+ except Exception:
|
|
|
+ continue
|
|
|
+ next_code = last_code + 1
|
|
|
+
|
|
|
+ for row_data in rows:
|
|
|
+ # 若目标行不存在,扩展表行数;若已存在空行,直接写入避免上移模板序号
|
|
|
+ if insert_row > ws.max_row:
|
|
|
+ ws.append([None] * len(COLUMNS))
|
|
|
+ for col_idx, col_name in enumerate(COLUMNS, start=1):
|
|
|
+ if col_name == "编码":
|
|
|
+ value = row_data.get("编码", next_code)
|
|
|
+ else:
|
|
|
+ value = row_data.get(col_name, "")
|
|
|
+ cell = ws.cell(row=insert_row, column=col_idx, value=value)
|
|
|
+ # 复制模板样式
|
|
|
+ if col_idx <= len(template_row):
|
|
|
+ tmpl = template_row[col_idx - 1]
|
|
|
+ cell._style = copy(tmpl._style)
|
|
|
+ next_code += 1
|
|
|
+ insert_row += 1
|
|
|
+
|
|
|
+ try:
|
|
|
+ wb.save(path)
|
|
|
+ except PermissionError:
|
|
|
+ alt = path.with_name(f"{path.stem}_out_{int(time.time())}{path.suffix}")
|
|
|
+ wb.save(alt)
|
|
|
+ print(f"原文件被占用,已写入副本:{alt}")
|
|
|
+
|
|
|
+
|
|
|
+if __name__ == "__main__":
|
|
|
+ ITEM_URLS = [
|
|
|
+ "https://detail.1688.com/offer/860913286492.html?src=zhanwai&pid=301011_0000&ptid=017700000007986632e2076d03b97563&exp=enquiry%3AB%3BqueryMobilePhone%3AC%3Bxlyx%3AB&_force_exp_buckets_=11803%2C2024061703%2C2024011602&spm=a312h.2018_new_sem.dh_002.1.f2803d1evDvWwN&cosite=baidujj_pz&tracelog=p4p&_p_isad=1&clickid=73a657ca198445129a0f9657e5b848a5&sessionid=a22bd65f97342308cefe01ce93b2fb30&a=1128&e=Do0-iFADT1PETFOTY8XkCkiE39RAE-Osyk6HP6xWf0u9yqFIKWxE-tv1Y3207LPZ5B-yn5Il1MSwxqPfQ8JvdyS98dA7jdxprtnbuiV4OUqnLO30gnz1.Xd-cgEt.XmRfbc0snn-075TsNtnLsryNlOtFDjT98D.4kjEePgSJhSsTCWyLsXVy0.5ucS4u.tGix-9aJf-M.TJFqBjAJ84c-ZtUAhSFMPxrFV7CUJXRsAfWkEYWS5X9RD2lAIIXvm1vGzlY7ihi3tvyEKFOnqcIOZfewv0q.6g&sk=sem&style=1",
|
|
|
+ 'https://detail.1688.com/offer/711070382704.html?src=zhanwai&pid=301011_0000&ptid=017700000007986632e2076d03b97563&exp=enquiry%3AB%3BqueryMobilePhone%3AA%3Bxlyx%3AB&_force_exp_buckets_=11803%2C2024061701%2C2024011602&spm=a312h.2018_new_sem.dh_002.3.f2803d1eBPrwvm&cosite=baidujj_pz&tracelog=p4p&_p_isad=1&clickid=8f28c35e2dab4a02ac6ab7bde4d85d7e&sessionid=a22bd65f97342308cefe01ce93b2fb30&a=1143&e=zES-dp9sJJ3zrrk0AjWQ.80TNDrQDIxAfEp4OeWz-t73PxVXx2khSRkm4WW-Gc2mWvz9cqAP-EtjxMGkm81MROlxv-X4ECd0aflzSA.u7Sw3XqQVUjl1FuCMFIzXZNqVuhIhieuUBNzHeKI5kOSG2IZTlYUgrDcjviyzqR9QbTS1DK6Qg8fQSsEm2SWtGdfb.otTyBxd30qq6gPLLvqhYiW2FW.t4rGnK1eNs8ThmT6RNZF-ol9mriMaxAlPq2t2MvQtMFuvgdZWZh-v9vZHQsr2UDsUSUJ44V7XlLMFYIk_&sk=sem&style=1',
|
|
|
+
|
|
|
+ # 替换为你的商品链接
|
|
|
+ ]
|
|
|
+ driver = build_driver()
|
|
|
+ try:
|
|
|
+ results = []
|
|
|
+ for url in ITEM_URLS:
|
|
|
+ info = scrape_item(driver, url)
|
|
|
+ results.append(info)
|
|
|
+ print(json.dumps(info, ensure_ascii=False, indent=2))
|
|
|
+ human_wait(1.5) # 控制节奏,避免频繁触发风控
|
|
|
+ save_to_excel(EXCEL_PATH, results)
|
|
|
+ finally:
|
|
|
+ driver.quit()
|