#!/usr/bin/env python3 """ SmartScraper Pagination Example (Async) This example demonstrates how to use pagination functionality with SmartScraper API using the asynchronous client. """ import asyncio import json import logging import os import time from pydantic import BaseModel from typing import List, Optional from dotenv import load_dotenv from scrapegraph_py import AsyncClient from scrapegraph_py.exceptions import APIError # Load environment variables from .env file load_dotenv() # Configure logging logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", handlers=[logging.StreamHandler()], ) logger = logging.getLogger(__name__) class ProductInfo(BaseModel): """Schema for product information""" name: str price: Optional[str] = None rating: Optional[str] = None image_url: Optional[str] = None description: Optional[str] = None class ProductList(BaseModel): """Schema for list of products""" products: List[ProductInfo] async def smartscraper_pagination_example(): """Example of using pagination with SmartScraper (async)""" print("SmartScraper Pagination Example (Async)") print("=" * 50) # Initialize client from environment variable api_key = os.getenv("SGAI_API_KEY") if not api_key: print("β Error: SGAI_API_KEY environment variable not set") return try: client = AsyncClient(api_key=api_key) except Exception as e: print(f"β Error initializing client: {e}") return # Configuration website_url = "https://www.amazon.in/s?k=tv&crid=1TEF1ZFVLU8R8&sprefix=t%2Caps%2C390&ref=nb_sb_noss_2" user_prompt = "Extract all product info including name, price, rating, image_url, and description" total_pages = 3 # Number of pages to scrape print(f"π Website URL: {website_url}") print(f"π User Prompt: {user_prompt}") print(f"π Total Pages: {total_pages}") print("-" * 50) try: # Start timing start_time = time.time() # Make the request with pagination result = await client.smartscraper( user_prompt=user_prompt, website_url=website_url, output_schema=ProductList, total_pages=total_pages ) # Calculate duration duration = time.time() - start_time print(f"β
Request completed in {duration:.2f} seconds") print(f"π Response type: {type(result)}") # Display results if isinstance(result, dict): print("\nπ Response:") print(json.dumps(result, indent=2, ensure_ascii=False)) # Check for pagination success indicators if "data" in result: print(f"\nβ¨ Pagination successful! Data extracted from {total_pages} pages") elif isinstance(result, list): print(f"\nβ
Pagination successful! Extracted {len(result)} items") for i, item in enumerate(result[:5]): # Show first 5 items print(f" {i+1}. {item}") if len(result) > 5: print(f" ... and {len(result) - 5} more items") else: print(f"\nπ Result: {result}") except APIError as e: print(f"β API Error: {e}") except Exception as e: print(f"β Unexpected error: {e}") async def test_concurrent_pagination(): """Test multiple pagination requests concurrently""" print("\n" + "=" * 50) print("Testing concurrent pagination requests") print("=" * 50) api_key = os.getenv("SGAI_API_KEY") if not api_key: print("β Error: SGAI_API_KEY environment variable not set") return try: client = AsyncClient(api_key=api_key) except Exception as e: print(f"β Error initializing client: {e}") return # Test concurrent requests urls = [ "https://example.com/products?page=1", "https://example.com/products?page=2", "https://example.com/products?page=3", ] tasks = [] for i, url in enumerate(urls): print(f"π Creating task {i+1} for URL: {url}") tasks.append(asyncio.create_task( simulate_pagination_request(client, url, i+1) )) print(f"β±οΈ Starting {len(tasks)} concurrent tasks...") start_time = time.time() try: results = await asyncio.gather(*tasks, return_exceptions=True) duration = time.time() - start_time print(f"β
All tasks completed in {duration:.2f} seconds") for i, result in enumerate(results): if isinstance(result, Exception): print(f"β Task {i+1} failed: {result}") else: print(f"β
Task {i+1} succeeded: {result}") except Exception as e: print(f"β Concurrent execution failed: {e}") async def simulate_pagination_request(client: AsyncClient, url: str, task_id: int): """Simulate a pagination request (for demonstration)""" print(f"π Task {task_id}: Processing {url}") # Simulate some work await asyncio.sleep(0.5) # Return a simulated result return f"Task {task_id} completed successfully" async def main(): """Main function to run the pagination examples""" print("ScrapeGraph SDK - SmartScraper Pagination Examples (Async)") print("=" * 60) # Run the main example await smartscraper_pagination_example() # Test concurrent pagination await test_concurrent_pagination() print("\n" + "=" * 60) print("Examples completed!") print("\nNext steps:") print("1. Set SGAI_API_KEY environment variable") print("2. Replace example URLs with real websites") print("3. Adjust total_pages parameter (1-10)") print("4. Customize user_prompt for your use case") print("5. Define output_schema for structured data") if __name__ == "__main__": asyncio.run(main())