Sfoglia il codice sorgente

Add .gitignore and reference files before overwriting master

LuTong 2 mesi fa
parent
commit
bbb9dc5de9
3 file modificati con 312 aggiunte e 0 eliminazioni
  1. 29 0
      .gitignore
  2. 92 0
      refe/pyqt.py
  3. 191 0
      refe/req.py

+ 29 - 0
.gitignore

@@ -0,0 +1,29 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+
+# PyInstaller
+build/
+dist/
+*.spec
+
+# IDE
+.idea/
+.vscode/
+
+# Browser profiles / data
+*_profile/
+*_user_data/
+*_session/
+edge_profile/
+chrome_stable_profile/
+1688_edge_pure_session/
+1688_edge_session/
+1688_edge_stable_session/
+1688_edge_ultimate_session/
+1688_user_data/
+
+# Excel temp files
+~$*
+

+ 92 - 0
refe/pyqt.py

@@ -0,0 +1,92 @@
+import sys, json, time
+from pathlib import Path
+# from PyQt5.QtWidgets import (
+from PyQt6.QtWidgets import (
+    QApplication, QWidget, QVBoxLayout, QHBoxLayout,
+    QTextEdit, QPushButton, QLabel, QMessageBox
+)
+from PyQt6.QtCore import Qt, QThread, pyqtSignal
+# from PyQt5.QtCore import Qt, QThread, pyqtSignal
+
+# 引入你现有的函数
+from req import build_driver, scrape_item, save_to_excel, EXCEL_PATH
+
+class Worker(QThread):
+    """Background thread that scrapes a list of URLs and writes the results to Excel.
+
+    Progress is reported through the ``log`` signal (one text line per
+    emission). ``done`` is emitted once at the end of ``run()``, whether the
+    job succeeded or failed, so the UI can always re-enable its controls.
+    """
+
+    # Carries one human-readable log line.
+    log = pyqtSignal(str)
+    # Emitted at the very end of run(), on success and on failure alike.
+    done = pyqtSignal()
+
+    def __init__(self, urls):
+        """Store the URLs to process.
+
+        Args:
+            urls: iterable of URL strings; blank entries are skipped in run().
+        """
+        super().__init__()
+        self.urls = urls
+
+    def run(self):
+        """Scrape every non-blank URL, then save all results to EXCEL_PATH.
+
+        Any exception (including a failure to create the driver) is reported
+        through ``log`` instead of escaping the thread. Results are written
+        only if the whole loop completed without an exception.
+        """
+        # The driver is created inside the try block: if attaching to the
+        # browser fails, the error is logged and ``done`` is still emitted.
+        driver = None
+        results = []
+        try:
+            driver = build_driver()
+            for url in self.urls:
+                if not url.strip():
+                    continue
+                self.log.emit(f"开始抓取: {url}")
+                info = scrape_item(driver, url.strip())
+                results.append(info)
+                self.log.emit(json.dumps(info, ensure_ascii=False))
+                # Fixed pause between items.
+                time.sleep(1.0)
+            if results:
+                save_to_excel(EXCEL_PATH, results)
+                self.log.emit(f"写入完成:{EXCEL_PATH}")
+        except Exception as e:
+            self.log.emit(f"出错: {e}")
+        finally:
+            try:
+                if driver is not None:
+                    driver.quit()
+            except Exception as e:
+                # A failing quit() must not prevent ``done`` from firing.
+                self.log.emit(f"出错: {e}")
+            finally:
+                self.done.emit()
+
+class App(QWidget):
+    """Main window: a URL input box, a start button and a read-only log view."""
+
+    def __init__(self):
+        """Build the widgets, lay them out vertically and wire the start button."""
+        super().__init__()
+        # Holds the currently running Worker (None until the first start).
+        self.worker = None
+
+        self.setWindowTitle("1688抓取")
+        self.resize(800, 600)
+
+        # One product link per line.
+        self.input = QTextEdit()
+        self.input.setPlaceholderText("每行一个商品链接")
+
+        # Log output; the user cannot edit it.
+        self.log = QTextEdit()
+        self.log.setReadOnly(True)
+
+        self.btn = QPushButton("开始抓取")
+        self.btn.clicked.connect(self.start)
+
+        # (widget, stretch factor) pairs, top to bottom.
+        rows = (
+            (QLabel("商品链接:"), 0),
+            (self.input, 3),
+            (self.btn, 0),
+            (QLabel("日志:"), 0),
+            (self.log, 4),
+        )
+        layout = QVBoxLayout()
+        for widget, stretch in rows:
+            layout.addWidget(widget, stretch)
+        self.setLayout(layout)
+
+    def start(self):
+        """Read the URLs from the input box and launch a Worker for them.
+
+        Shows a warning and does nothing else when no non-blank line is present.
+        The start button stays disabled until the worker reports ``done``.
+        """
+        lines = self.input.toPlainText().splitlines()
+        urls = [line for line in lines if line.strip()]
+        if not urls:
+            QMessageBox.warning(self, "提示", "请先输入链接")
+            return
+
+        self.btn.setEnabled(False)
+        worker = Worker(urls)
+        self.worker = worker
+        worker.log.connect(self.append_log)
+        worker.done.connect(self.finish)
+        worker.start()
+
+    def append_log(self, text):
+        """Append one line of text to the log view."""
+        self.log.append(text)
+
+    def finish(self):
+        """Re-enable the start button and note in the log that the job ended."""
+        self.btn.setEnabled(True)
+        self.log.append("任务完成")
+
+if __name__ == "__main__":
+    # Script entry point: create the Qt application, show the window and
+    # hand the event loop's exit status back to the interpreter.
+    qt_app = QApplication(sys.argv)
+    window = App()
+    window.show()
+    exit_code = qt_app.exec()  # PyQt5 spells this exec_()
+    sys.exit(exit_code)

+ 191 - 0
refe/req.py

@@ -0,0 +1,191 @@
+import json
+import time
+from copy import copy
+from pathlib import Path
+
+from openpyxl import Workbook, load_workbook
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+
+# Address of the Chrome remote-debugging endpoint to attach to.
+DEBUG_ADDR = "127.0.0.1:9222"  # must match the port Chrome was started with
+# Timeout handed to WebDriverWait in scrape_item.
+WAIT = 15
+
+# Path of the Excel workbook the results are written to.
+# NOTE(review): hard-coded absolute path on one user's machine — adjust before running elsewhere.
+EXCEL_PATH = Path(
+    r"C:\Users\Meng\PycharmProjects\PythonProject\【进价】产品信息空表.xlsx"
+)
+
+
+# Column headers, in sheet order. The first entry is the code column that
+# save_to_excel numbers; the remaining entries are the data columns.
+COLUMNS = [
+    "编码",
+    "品类",
+    "品牌",
+    "商品名称",
+    "颜色",
+    "规格尺码",
+    "材质",
+    "单品进价(元)",
+    "moq(起订量)",
+    "批发进价(元)",
+    "产品链接",
+    "供应商信息",
+]
+
+def build_driver():
+    """Create a Chrome WebDriver configured to use the browser at DEBUG_ADDR.
+
+    Returns:
+        A ``webdriver.Chrome`` instance built from the options below.
+    """
+    chrome_options = Options()
+    chrome_options.add_experimental_option("debuggerAddress", DEBUG_ADDR)
+    extra_flags = (
+        "--start-maximized",
+        # Optional flag intended to hide automation fingerprints.
+        "--disable-blink-features=AutomationControlled",
+    )
+    for flag in extra_flags:
+        chrome_options.add_argument(flag)
+    return webdriver.Chrome(options=chrome_options)
+
+def human_wait(sec=1.2):
+    """Block the calling thread for ``sec`` seconds (default 1.2)."""
+    pause_seconds = sec
+    time.sleep(pause_seconds)
+
+def scrape_item(driver, url):
+    """Open a product page and collect its fields into a dict.
+
+    Values are read first from the page's embedded JavaScript model
+    (``window.context.result.global.globalData.model``); DOM lookups are used
+    as a fallback for fields the model does not provide.
+
+    Args:
+        driver: Selenium WebDriver used to load and query the page.
+        url: Product page URL; stored unchanged under "产品链接".
+
+    Returns:
+        dict with one entry for every name in COLUMNS except "编码".
+        Fields that cannot be found are empty strings.
+
+    Exceptions raised while loading the page or waiting for ``<body>`` are
+    not caught here and propagate to the caller.
+    """
+    driver.get(url)
+    WebDriverWait(driver, WAIT).until(
+        EC.presence_of_element_located((By.TAG_NAME, "body"))
+    )
+    human_wait()
+
+    def safe_text(by, sel):
+        """Return the stripped text of the first matching element, or ""."""
+        try:
+            return driver.find_element(by, sel).text.strip()
+        except Exception:
+            # Best-effort lookup: a missing element simply yields "".
+            return ""
+
+    def attr_table_text(label):
+        """Read the value shown next to ``label`` in the productAttributes table."""
+        return safe_text(
+            By.XPATH,
+            f"//div[@id='productAttributes']//th[span='{label}']/following-sibling::td[1]//span[@class='field-value']",
+        )
+
+    def as_dict(value):
+        """Return ``value`` if it is a dict, else {} (guards against null/odd JSON)."""
+        return value if isinstance(value, dict) else {}
+
+    # The page exposes the full product data at
+    # window.context.result.global.globalData.model; null when absent.
+    model = as_dict(driver.execute_script(
+        "return (window.context && window.context.result && "
+        "window.context.result.global && window.context.result.global.globalData "
+        "&& window.context.result.global.globalData.model) || null;"
+    ))
+    # ``.get(key, {})`` would not help when a key is present with a null
+    # value, so every sub-model is normalised to a dict up front.
+    offer = as_dict(model.get("offerDetail"))
+    trade = as_dict(model.get("tradeModel"))
+    seller = as_dict(model.get("sellerModel"))
+
+    def get_attr(name):
+        """Return the value of the named entry in featureAttributes, or ""."""
+        attrs = offer.get("featureAttributes")
+        if not isinstance(attrs, list):
+            return ""
+        for item in attrs:
+            if isinstance(item, dict) and item.get("name") == name:
+                return item.get("value", "")
+        return ""
+
+    # Price and MOQ.
+    price_min = trade.get("minPrice", "") or ""
+    price_max = trade.get("maxPrice", "") or ""
+    begin_amount = trade.get("beginAmount", "")
+
+    # Tiered wholesale prices: take the first non-empty source.
+    ranges = (
+        trade.get("disPriceRanges")
+        or trade.get("currentPrices")
+        or as_dict(trade.get("offerPriceModel")).get("currentPrices")
+        or []
+    )
+    range_text = " / ".join(
+        f"{r.get('beginAmount')}起 ¥{r.get('price') or r.get('discountPrice')}"
+        for r in ranges
+        if isinstance(r, dict)
+    )
+
+    # Show "min-max" only when both bounds exist and differ.
+    if price_min and price_max and price_min != price_max:
+        unit_price = f"{price_min}-{price_max}"
+    elif price_min:
+        unit_price = f"{price_min}"
+    else:
+        unit_price = ""
+
+    data = {
+        "品类": offer.get("leafCategoryName", "")
+               or safe_text(By.CSS_SELECTOR, "div[class*=breadcrumb] a:last-child"),
+        "品牌": get_attr("品牌"),
+        "商品名称": offer.get("subject", "")
+                 or safe_text(By.CSS_SELECTOR, "h1.d-title")
+                 or safe_text(By.CSS_SELECTOR, "h1[class*=title]"),
+        "颜色": get_attr("颜色") or attr_table_text("颜色"),
+        "规格尺码": get_attr("尺码") or attr_table_text("尺码"),
+        "材质": get_attr("材质") or attr_table_text("材质"),
+        "单品进价(元)": unit_price,
+        "moq(起订量)": begin_amount or attr_table_text("起订量"),
+        "批发进价(元)": range_text,
+        "产品链接": url,
+        "供应商信息": seller.get("companyName", "")
+                   or safe_text(By.CSS_SELECTOR, "a.company-name")
+                   or safe_text(By.CSS_SELECTOR, "div.company-name"),
+    }
+    return data
+
+
+def save_to_excel(path: Path, rows: list[dict]):
+    """Write scraped rows into the workbook at ``path``, reusing its styling.
+
+    The active worksheet is used. If ``path`` does not exist, a new workbook
+    is created with COLUMNS as its header row.
+
+    Rules:
+      * Template row: row 3 if the sheet has at least 3 rows, else row 2,
+        else row 1. Its cell styles are copied onto every row written.
+      * Placement: each entry goes into the next row below the template row
+        whose data columns (all columns except the first, "编码") are empty.
+        Once no such row is left, entries go below the last existing row.
+        Rows that already hold data are never overwritten.
+      * Code column: an explicit "编码" key in the entry wins; otherwise a code
+        already present in the target row is kept; otherwise the next number
+        after the largest numeric value in column 1 is assigned.
+
+    Args:
+        path: Workbook location.
+        rows: One dict per record, keyed by the names in COLUMNS; missing keys
+            are written as empty strings.
+
+    If saving raises PermissionError, the workbook is saved next to ``path``
+    as ``<stem>_out_<unix time><suffix>`` and a notice is printed.
+    """
+    if path.exists():
+        wb = load_workbook(path)
+        ws = wb.active
+    else:
+        wb = Workbook()
+        ws = wb.active
+        ws.append(COLUMNS)
+
+    # Pick the template row (normally the first styled data row).
+    template_row_idx = 3 if ws.max_row >= 3 else 2 if ws.max_row >= 2 else 1
+    template_row = ws[template_row_idx]
+
+    data_cols = range(2, len(COLUMNS) + 1)  # skip the code column
+
+    def is_blank(value):
+        """True for the two values treated as an empty cell."""
+        return value in (None, "")
+
+    def data_area_is_empty(row_idx):
+        """True when every data column of ``row_idx`` is blank."""
+        return all(
+            is_blank(ws.cell(row=row_idx, column=c).value) for c in data_cols
+        )
+
+    # Starting code: one past the largest number already in column 1.
+    last_code = 0
+    for r in range(1, ws.max_row + 1):
+        val = ws.cell(row=r, column=1).value
+        if isinstance(val, bool) or not isinstance(val, (int, float)):
+            continue
+        try:
+            if val > last_code:
+                last_code = int(val)
+        except (OverflowError, ValueError):
+            # inf / nan cannot be converted to an int code; ignore them.
+            continue
+    next_code = last_code + 1
+
+    # Rows up to this index existed before we started writing; only those
+    # need to be checked for existing data.
+    last_existing_row = ws.max_row
+    insert_row = template_row_idx + 1
+
+    for row_data in rows:
+        # Skip rows that already contain data so nothing gets overwritten.
+        while insert_row <= last_existing_row and not data_area_is_empty(insert_row):
+            insert_row += 1
+
+        # Keep a code the template already assigned to this row.
+        prefilled_code = ws.cell(row=insert_row, column=1).value
+        if "编码" in row_data:
+            code = row_data["编码"]
+        elif not is_blank(prefilled_code):
+            code = prefilled_code
+        else:
+            code = next_code
+            next_code += 1
+
+        for col_idx, col_name in enumerate(COLUMNS, start=1):
+            value = code if col_name == "编码" else row_data.get(col_name, "")
+            cell = ws.cell(row=insert_row, column=col_idx, value=value)
+            # Copy the template row's style when it has a cell in this column.
+            if col_idx <= len(template_row):
+                cell._style = copy(template_row[col_idx - 1]._style)
+        insert_row += 1
+
+    try:
+        wb.save(path)
+    except PermissionError:
+        # Target file is locked (e.g. open in Excel): write a sibling copy.
+        alt = path.with_name(f"{path.stem}_out_{int(time.time())}{path.suffix}")
+        wb.save(alt)
+        print(f"原文件被占用,已写入副本:{alt}")
+
+
+# Script entry point: scrape each URL in ITEM_URLS with one shared driver,
+# print every result as JSON, then write all results to EXCEL_PATH.
+if __name__ == "__main__":
+    ITEM_URLS = [
+        "https://detail.1688.com/offer/860913286492.html?src=zhanwai&pid=301011_0000&ptid=017700000007986632e2076d03b97563&exp=enquiry%3AB%3BqueryMobilePhone%3AC%3Bxlyx%3AB&_force_exp_buckets_=11803%2C2024061703%2C2024011602&spm=a312h.2018_new_sem.dh_002.1.f2803d1evDvWwN&cosite=baidujj_pz&tracelog=p4p&_p_isad=1&clickid=73a657ca198445129a0f9657e5b848a5&sessionid=a22bd65f97342308cefe01ce93b2fb30&a=1128&e=Do0-iFADT1PETFOTY8XkCkiE39RAE-Osyk6HP6xWf0u9yqFIKWxE-tv1Y3207LPZ5B-yn5Il1MSwxqPfQ8JvdyS98dA7jdxprtnbuiV4OUqnLO30gnz1.Xd-cgEt.XmRfbc0snn-075TsNtnLsryNlOtFDjT98D.4kjEePgSJhSsTCWyLsXVy0.5ucS4u.tGix-9aJf-M.TJFqBjAJ84c-ZtUAhSFMPxrFV7CUJXRsAfWkEYWS5X9RD2lAIIXvm1vGzlY7ihi3tvyEKFOnqcIOZfewv0q.6g&sk=sem&style=1",
+        'https://detail.1688.com/offer/711070382704.html?src=zhanwai&pid=301011_0000&ptid=017700000007986632e2076d03b97563&exp=enquiry%3AB%3BqueryMobilePhone%3AA%3Bxlyx%3AB&_force_exp_buckets_=11803%2C2024061701%2C2024011602&spm=a312h.2018_new_sem.dh_002.3.f2803d1eBPrwvm&cosite=baidujj_pz&tracelog=p4p&_p_isad=1&clickid=8f28c35e2dab4a02ac6ab7bde4d85d7e&sessionid=a22bd65f97342308cefe01ce93b2fb30&a=1143&e=zES-dp9sJJ3zrrk0AjWQ.80TNDrQDIxAfEp4OeWz-t73PxVXx2khSRkm4WW-Gc2mWvz9cqAP-EtjxMGkm81MROlxv-X4ECd0aflzSA.u7Sw3XqQVUjl1FuCMFIzXZNqVuhIhieuUBNzHeKI5kOSG2IZTlYUgrDcjviyzqR9QbTS1DK6Qg8fQSsEm2SWtGdfb.otTyBxd30qq6gPLLvqhYiW2FW.t4rGnK1eNs8ThmT6RNZF-ol9mriMaxAlPq2t2MvQtMFuvgdZWZh-v9vZHQsr2UDsUSUJ44V7XlLMFYIk_&sk=sem&style=1',
+
+        # Replace with your own product links.
+    ]
+    driver = build_driver()
+    try:
+        results = []
+        for url in ITEM_URLS:
+            info = scrape_item(driver, url)
+            results.append(info)
+            print(json.dumps(info, ensure_ascii=False, indent=2))
+            human_wait(1.5)  # pace the requests to avoid tripping the site's risk control
+        save_to_excel(EXCEL_PATH, results)
+    finally:
+        # Always release the driver, even if scraping or saving failed.
+        driver.quit()