There are several ways to add headers to requests in Scrapy spiders. The most direct is to set them on each request manually:
```python
import scrapy

class MySpider(scrapy.Spider):
    def parse(self, response):
        yield scrapy.Request(..., headers={"x-token": "123"})
```
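To double-check what actually goes out on the wire, you can echo the request headers back from a test endpoint. Here's a minimal sketch using the public httpbin.org echo service (the spider name and token value are just placeholders for illustration):

```python
import scrapy

class HeaderCheckSpider(scrapy.Spider):
    # hypothetical spider used only to inspect outgoing headers
    name = "header_check"

    def start_requests(self):
        # httpbin.org/headers responds with the headers it received
        yield scrapy.Request(
            "https://httpbin.org/headers",
            headers={"x-token": "123"},
        )

    def parse(self, response):
        # the response body is JSON echoing back our request headers
        self.logger.info(response.text)
```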
However, to automatically add headers to every outgoing Scrapy request, the DEFAULT_REQUEST_HEADERS setting can be used:
```python
# settings.py
DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "my awesome scrapy robot",
}
```
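Scrapy's DefaultHeadersMiddleware applies these values with `setdefault`, so headers passed explicitly to a request still take precedence and the two approaches compose. A short sketch of that behavior (the spider, URLs, and token values are made up for illustration):

```python
# settings.py
DEFAULT_REQUEST_HEADERS = {
    "x-token": "123",
}
```

```python
import scrapy

class TokenDemoSpider(scrapy.Spider):
    # hypothetical spider; example.com is a placeholder URL
    name = "token_demo"

    def start_requests(self):
        # sent with x-token: 123, filled in from DEFAULT_REQUEST_HEADERS
        yield scrapy.Request("https://example.com/a")
        # sent with x-token: 456 - explicit request headers win over the defaults
        yield scrapy.Request("https://example.com/b", headers={"x-token": "456"})
```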
When more complex logic is needed, such as adding headers only to some requests or rotating a random User-Agent header, a downloader middleware is the best option:
```python
# middlewares.py
import random

class RandomUserAgentMiddleware:
    def __init__(self, user_agents):
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        """retrieve user agent list from settings.USER_AGENTS"""
        user_agents = crawler.settings.get('USER_AGENTS', [])
        if not user_agents:
            raise ValueError(
                'No user agents found in settings. Please provide a list '
                'of user agents in the USER_AGENTS setting.'
            )
        return cls(user_agents)

    def process_request(self, request, spider):
        """attach a random user agent to every outgoing request"""
        user_agent = random.choice(self.user_agents)
        request.headers.setdefault('User-Agent', user_agent)
        spider.logger.debug(f'Using User-Agent: {user_agent}')
```

```python
# settings.py
DOWNLOADER_MIDDLEWARES = {
    # ...
    # disable Scrapy's built-in UserAgentMiddleware (priority 500) so it
    # doesn't set a default User-Agent before our setdefault() call runs
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'myproject.middlewares.RandomUserAgentMiddleware': 760,
    # ...
}
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    # ...
]
```
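Because the middleware uses `headers.setdefault()`, a request that sets its own User-Agent explicitly keeps it. A quick sketch of that behavior (the spider name is hypothetical; httpbin.org/headers simply echoes the headers it receives):

```python
import scrapy

class RotationDemoSpider(scrapy.Spider):
    # hypothetical spider to observe the User-Agent rotation in action
    name = "rotation_demo"

    def start_requests(self):
        # gets a random User-Agent assigned by the middleware
        yield scrapy.Request("https://httpbin.org/headers", dont_filter=True)
        # keeps its explicit User-Agent - setdefault() won't overwrite it
        yield scrapy.Request(
            "https://httpbin.org/headers",
            headers={"User-Agent": "custom-agent/1.0"},
            dont_filter=True,
        )

    def parse(self, response):
        # each response body echoes back the headers that were sent
        self.logger.info(response.text)
```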
Note that if you're using Scrapfly's Scrapy SDK, some headers like the User-Agent string are managed automatically by the smart anti-blocking API.