DEV Community

drake
drake

Posted on

Playwright 自动下载 PDF 文件

import asyncio
from pathlib import Path
from urllib.parse import urlsplit

from patchright.async_api import Error as PlaywrightError
from patchright.async_api import async_playwright


def _is_pdf_url(url: str) -> bool:
    """Return True when the path component of *url* ends in ``.pdf``.

    Only the path is inspected, so a query string or fragment after the file
    name does not hide the extension; the comparison is case-insensitive.
    """
    return urlsplit(url).path.lower().endswith(".pdf")


async def handle_pdf_route(route) -> None:
    """Intercept PDF requests and force the browser to download them.

    Non-PDF requests are passed through untouched. For a PDF request the
    response is fetched once, its size and first bytes are printed, and it is
    fulfilled with a ``Content-Disposition: attachment`` header so the browser
    downloads the file instead of rendering it in the built-in viewer.

    Args:
        route: The Playwright route object passed to ``page.route`` handlers.
    """
    if not _is_pdf_url(route.request.url):
        await route.continue_()
        return

    # Fetch exactly once; the same response supplies headers and body.
    response = await route.fetch()
    binary_data = await response.body()  # raw bytes of the PDF
    print(f"获取到 PDF 文档,大小: {len(binary_data)} bytes")
    print(f"PDF 文档内容: {binary_data[:100]}...")  # first 100 bytes only

    # Drop any existing header regardless of case so the response does not end
    # up carrying two conflicting Content-Disposition values.
    headers = {
        name: value
        for name, value in response.headers.items()
        if name.lower() != "content-disposition"
    }
    headers["Content-Disposition"] = 'attachment; filename="document.pdf"'
    await route.fulfill(response=response, headers=headers)


async def main() -> None:
    """Open the PDF URL in Chromium and save the forced download to disk.

    The file is written to the current working directory under the file name
    suggested by the download (set by ``handle_pdf_route``).
    """
    url = "https://www.fatf-gafi.org/content/dam/fatf-gafi/guidance/Second-12-Month-Review-Revised-FATF-Standards-Virtual-Assets-VASPS.pdf"
    async with async_playwright() as p:
        # Visible mode makes debugging easier.
        browser = await p.chromium.launch(headless=False)
        try:
            context = await browser.new_context(accept_downloads=True)
            page = await context.new_page()

            # The route must be registered BEFORE navigating, otherwise the
            # PDF request is sent without ever reaching the handler.
            await page.route("**/*", handle_pdf_route)

            async with page.expect_download() as download_info:
                try:
                    await page.goto(url)
                except PlaywrightError:
                    # Navigating to a response that turns into a download
                    # aborts the navigation; that is the expected outcome
                    # here. A genuine failure still surfaces because
                    # expect_download() then times out.
                    pass
            download = await download_info.value

            # Persist the file before the context closes; downloads are
            # otherwise discarded together with the browser context.
            await download.save_as(Path(download.suggested_filename))
        finally:
            await browser.close()


if __name__ == "__main__":
    asyncio.run(main())
Enter fullscreen mode Exit fullscreen mode

Top comments (0)