基于 Playwright 的通用 Python 爬取框架,提供 Pipeline 流程编排、结构化提取、中间件、多格式导出等能力。
pip install -r requirements.txt
playwright install chromium

playwright/
├── scraper/
│ ├── core/
│ │ └── browser.py # BrowserManager / PageManager / BrowserConfig
│ ├── middleware/
│ │ ├── stealth.py # 反检测脚本
│ │ ├── retry.py # 指数退避重试
│ │ └── rate_limit.py # 令牌桶限速器
│ ├── extractors/
│ │ └── selector.py # Field + Selector 声明式提取
│ ├── pipeline/
│ │ ├── base.py # Pipeline / Step 基类
│ │ └── steps.py # 内置 Step:导航/点击/填表/滚动/等待/导出
│ ├── exporters/
│ │ ├── json_exporter.py # JSON / JSONL
│ │ └── csv_exporter.py # CSV
│ ├── utils/
│ │ └── helpers.py # chunk / random_delay / normalize_url
│ └── scraper.py # Scraper 门面 + 快速 scrape() API
├── examples/
│ └── basic_scraper.py
└── requirements.txt
最简单的用法,函数式 API,开箱即用。
import asyncio
from scraper import scrape, Selector, Field
async def main():
items = await scrape(
"https://books.toscrape.com",
Selector(
item="article.product_pod",
fields=[
Field("title", "h3 a", attr="title"),
Field("price", "p.price_color",
transform=lambda v: float(v.replace("£", ""))),
Field("link", "h3 a", attr="href"),
Field("image", "img.thumbnail", attr="src"),
],
),
output_json="books.json",
wait_selector=".product_pod",
)
print(f"抓取到 {len(items)} 条数据")
asyncio.run(main())

链式组合 Step,构建复杂的多步骤抓取流程。
from scraper import (
Pipeline, Scraper, Selector, Field,
NavigateStep, FormStep, ClickAndWaitStep, ExtractStep,
ScrollStep, WaitStep, ExportStep, JSONExporter,
)
async def main():
schema = Selector(item=".result-item", fields=[
Field("title", "h3"),
Field("url", "a", attr="href"),
])
pipeline = (
Pipeline("search-scraper")
# 导航到首页
.add(NavigateStep(url="https://example.com", wait_selector=".search-box"))
# 填入搜索关键词并提交
.add(FormStep(
fields={"#keyword": "playwright"},
submit="#search-btn",
))
# 等待结果加载
.add(WaitStep(selector=".result-item", duration_ms=500))
# 提取第一页数据
.add(ExtractStep(schema))
# 点击下一页并等待刷新
.add(ClickAndWaitStep(text="下一页", wait_selector=".result-item"))
# 提取第二页数据
.add(ExtractStep(schema))
# 写入 JSON
.add(ExportStep(JSONExporter("output/results.json", mode="array", indent=2)))
)
async with Scraper() as scraper:
items = await scraper.run("https://example.com", pipeline)
print(f"共抓取 {len(items)} 条")
asyncio.run(main())

run() 也可以接收 URL 列表,自动限速并逐页执行同一个 Pipeline。
async with Scraper() as scraper:
urls = [f"https://example.com/page/{i}" for i in range(1, 11)]
items = await scraper.run(urls, pipeline)

逐条处理 + 增量导出,适合长时间运行。
from scraper import CSVExporter
async with Scraper() as scraper:
await scraper.run_with_callback(
urls=urls,
pipeline=pipeline,
exporter=CSVExporter("output/data.csv"),
on_item=lambda item: print(f"处理: {item['title']}"),
on_page_done=lambda url, items: print(f"{url}: {len(items)} 条"),
)

Pipeline 是一个线性 Step 链,每个 Step 接收并返回一个 ctx 字典(context)。数据在 Step 之间传递和累积。
| 键 | 含义 |
|---|---|
| `ctx["page"]` | 当前 Playwright Page 对象 |
| `ctx["url"]` | 当前页面 URL |
| `ctx["items"]` | 累积的提取结果 `list[dict]` |
| `ctx["data"]` | 自由使用的数据槽 |
自定义 Step:
from scraper.pipeline import Step
class ScreenshotStep(Step):
def __init__(self, path: str):
super().__init__(name="screenshot")
self.path = path
async def run(self, ctx: dict) -> dict:
await ctx["page"].screenshot(path=self.path, full_page=True)
return ctx

| Step | 用途 |
|---|---|
| `NavigateStep(url, wait_selector, timeout)` | 导航到 URL,可选等待选择器 |
| `ExtractStep(selector)` | 用 Selector 提取结构化数据 |
| `ClickStep(selector, text, nth, after_delay, force)` | 点击元素(CSS / 文本 / 第N个) |
| `ClickAndWaitStep(selector, text, wait_selector)` | 点击后等待目标元素出现 |
| `FormStep(fields, submit)` | 批量填表 + 可选提交按钮 |
| `HoverStep(selector)` | 悬停触发下拉 / 提示 |
| `SelectStep(selector, value, label)` | 下拉选择 `<select>` |
| `ScrollStep(direction, steps, delay_ms)` | 页面滚动(bottom / infinite) |
| `WaitStep(selector, duration_ms)` | 等待选择器或固定时长 |
| `ExportStep(exporter)` | 导出累积数据 |

Field 描述单个字段,Selector 定义整体提取 schema。
from scraper import Selector, Field
schema = Selector(
item=".product-card", # 单个 item 的容器选择器
base_selector=".main-content", # 可选,限定提取范围
max_items=50, # 可选,最多提取条数
fields=[
Field("title", "h3.product-title"),
Field("url", "a", attr="href"),
Field("img", "img", attr="src", many=True), # 提取所有匹配项为列表
Field("price", ".price",
transform=lambda v: float(v.replace("¥", ""))), # 变换
Field("stock", ".stock", fallback="未知"), # 提取失败时的默认值
],
)
# 提取
items = await schema.extract_all(page)

from scraper.middleware import RateLimiter
limiter = RateLimiter(requests_per_second=3)
async with limiter:
await page.goto(url) # 自动等待
# 或传给 Scraper
scraper = Scraper(rate_limiter=RateLimiter(requests_per_second=1.5))

from scraper.middleware import retry_call, RetryConfig
html = await retry_call(
page.content,
retry_config=RetryConfig(max_retries=5, base_delay=1.0),
)

Scraper 默认开启,自动注入反检测脚本(隐藏 navigator.webdriver、伪造 chrome.runtime 等)。
# 默认开启
scraper = Scraper(stealth=True)
# 手动对单个 page 应用
from scraper.middleware import apply_stealth
await apply_stealth(page)
# 对 context 应用(所有新 page 自动生效)
from scraper.middleware import apply_stealth_context
await apply_stealth_context(context)

# JSONL(每行一条 JSON,适合追加写入)
JSONExporter("output.jsonl", mode="lines")
# JSON Array(单个数组,可选 indent 格式化)
JSONExporter("output.json", mode="array", indent=2)
# CSV(自动从首条数据推断表头)
CSVExporter("output.csv")
# 或显式指定表头
CSVExporter("output.csv", headers=["title", "price", "link"])

PageManager 包装了 Playwright 原生 Page,增加了安全等待和便捷方法。同时支持 `__getattr__` 透传,可直接调用所有原生 API。
page = await scraper.new_page()
# 导航
await page.goto_safe(url)
await page.wait_for_selector_safe(".content")
# 交互
await page.click_safe("#submit")
await page.click_by_text("下一页")
await page.click_nth(".item", 3)
await page.hover_safe(".menu")
await page.fill_safe("#input", "hello")
await page.type_safe("#input", "hello", delay=50) # 逐字输入
await page.press_safe("#input", "Enter")
await page.upload_file_safe("input[type=file]", "/path/to/file")
# 滚动
await page.scroll_to_bottom(steps=5, delay_ms=500)
await page.scroll_infinite(check_selector=".item", max_scrolls=50)
# 提取
text = await page.text(".title")
all_texts = await page.texts(".item")
href = await page.attribute("a", "href")
# 原生 API 透传
await page.locator(".item").nth(0).click()
await page.evaluate("document.title")

from scraper import Scraper
from scraper.core import BrowserConfig
config = BrowserConfig(
headless=True, # 无头模式
browser_type="chromium", # chromium / firefox / webkit
viewport={"width": 1920, "height": 1080},
user_agent="Mozilla/5.0 ...", # 自定义 UA
proxy={"server": "http://127.0.0.1:7890"},
block_images=False, # 拦截图片(加速)
block_media=True, # 拦截音视频
timeout=30_000, # 默认超时 ms
)
scraper = Scraper(config=config)

from scraper.utils import chunk, normalize_url, random_delay, url_join
# 分块
chunk([1, 2, 3, 4, 5], size=2) # [[1, 2], [3, 4], [5]]
# URL 处理
normalize_url("/path", base="https://example.com") # "https://example.com/path"
url_join("https://example.com", "/api/data") # "https://example.com/api/data"
# 随机延迟(反反爬)
await random_delay(min_ms=500, max_ms=2000)

cd playwright
python examples/basic_scraper.py

演示了 5 种使用模式:快速抓取、Pipeline、多页翻页、回调处理、复杂交互。
- 请遵守目标网站的 `robots.txt` 和服务条款
- 适当调整 `RateLimiter` 的请求频率,避免对目标服务器造成压力
- 部分网站有更强的反爬机制,可能需要在 `BrowserConfig` 中配置更真实的 UA 或结合代理使用