
Commit c2c35bc

serpapi#24 implement pagination
1 parent d2f0dbc commit c2c35bc

5 files changed: +116 additions, −35 deletions

README.md

Lines changed: 17 additions & 7 deletions
````diff
@@ -469,45 +469,55 @@ Let's collect links accross multiple search result pages.
 ```python
 # to get 2 pages
 start = 0
-end = 20
+end = 40
+page_size = 10
 
 # basic search parameters
-params = {
+parameter = {
     "q": "coca cola",
     "tbm": "nws",
     "api_key": os.getenv("API_KEY"),
+    # optional pagination parameter
+    # the pagination method can take argument directly
     "start": start,
-    "end": end
+    "end": end,
+    "num": page_size
 }
 
 # as proof of concept
 # urls collects
 urls = []
 
 # initialize a search
-search = GoogleSearch(params)
+search = GoogleSearch(parameter)
 
-# create a python generator
+# create a python generator using parameter
 pages = search.pagination()
+# or set custom parameter
+pages = search.pagination(start, end, page_size)
 
 # fetch one search result per iteration
 # using a basic python for loop
 # which invokes python iterator under the hood.
 for page in pages:
     print(f"Current page: {page['serpapi_pagination']['current']}")
-
     for news_result in page["news_results"]:
         print(f"Title: {news_result['title']}\nLink: {news_result['link']}\n")
         urls.append(news_result['link'])
 
 # check if the total number pages is as expected
 # note: the exact number if variable depending on the search engine backend
-self.assertGreater(len(urls), 200)
+if len(urls) == (end - start):
+    print("all search results count match!")
+if len(urls) == len(set(urls)):
+    print("all search results are unique!")
 ```
 
 Examples to fetch links with pagination: [test file](https://github.com/serpapi/google-search-results-python/blob/master/tests/test_example_paginate.py), [online IDE](https://replit.com/@DimitryZub1/Scrape-Google-News-with-Pagination-python-serpapi)
 
 ## Change log
+2021-06-05 @ 2.4.0
+ - add page size support using num parameter
 2021-06-05 @ 2.3.0
 - add pagination support
 2021-04-28 @ 2.2.0
````
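For reference, a minimal runnable sketch of the updated README example, assuming the `google-search-results` package from this branch is installed and an `API_KEY` environment variable is set:

```python
import os
from serpapi import GoogleSearch

# window of results to fetch: 4 pages of 10 results each
start, end, page_size = 0, 40, 10

search = GoogleSearch({
    "q": "coca cola",
    "tbm": "nws",                      # Google News results
    "api_key": os.getenv("API_KEY"),
})

urls = []
# pagination() returns an iterator; each iteration performs one request
for page in search.pagination(start, end, page_size):
    for news_result in page["news_results"]:
        urls.append(news_result["link"])

print(f"collected {len(urls)} links, {len(set(urls))} unique")
```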

serpapi/pagination.py

Lines changed: 47 additions & 11 deletions
```diff
@@ -1,20 +1,56 @@
+import pprint
+
+DEFAULT_START = 0
+DEFAULT_END = 1000000000
+DEFAULT_num = 10
 
 # Paginate response in SearpApi
 class Pagination:
-    def __init__(self, client, start = 0, end = 1000000000, page_size = 10):
+
+    def __init__(self, client, start = DEFAULT_START, end = DEFAULT_END, num = DEFAULT_num):
+        # serp api client
         self.client = client
+        # range
         self.start = start
         self.end = end
-        self.page_size = page_size
-
+        self.num = num
+
+        # use value from the client
+        if self.start == DEFAULT_START:
+            if 'start' in self.client.params_dict:
+                self.start = self.client.params_dict['start']
+        if self.end == DEFAULT_END:
+            if 'end' in self.client.params_dict:
+                self.end = self.client.params_dict['end']
+        if self.num == DEFAULT_num:
+            if 'num' in self.client.params_dict:
+                self.num = self.client.params_dict['num']
+
+        # basic check
+        if self.start > self.end:
+            raise "start: {} must be less than end: {}".format(self.start, self.end)
+        if(self.start + self.num) > self.end:
+            raise "start + num: {} + {} must be less than end: {}".format(self.start, self.num, self.end)
+
     def __iter__(self):
+        self.update()
         return self
 
-    def __next__(self):
-        # execute search
+    def update(self):
         self.client.params_dict['start'] = self.start
+        self.client.params_dict['num'] = self.num
+        if self.start == 0:
+            self.client.params_dict['num'] += 1
+        if self.start > 0:
+            self.client.params_dict['start'] += 1
+
+    def __next__(self):
+        # update parameter
+        self.update()
+
+        # execute request
         result = self.client.get_dict()
-
+
         # stop if backend miss to return serpapi_pagination
         if not 'serpapi_pagination' in result:
             raise StopIteration
@@ -23,11 +59,11 @@ def __next__(self):
         if not 'next' in result['serpapi_pagination']:
             raise StopIteration
 
-        # increment page
-        self.start += self.page_size
-
-        # ends
-        if self.start > self.end:
+        # ends if no next page
+        if self.start + self.num > self.end:
             raise StopIteration
+
+        # increment start page
+        self.start += self.num
 
         return result
```
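To see what `update()` actually sends without calling the API, here is a small sketch with a hypothetical stub client (`FakeClient` is not part of the library; `Pagination` only touches `params_dict` and `get_dict()`). It makes the offset handling visible: `num` is bumped by 1 on the first request, `start` is bumped by 1 on every later one, and the final out-of-range request is still executed before `StopIteration` fires:

```python
from serpapi.pagination import Pagination

# hypothetical stand-in for SerpApiClient: records each request's offsets
# and always pretends a next page exists
class FakeClient:
    def __init__(self):
        self.params_dict = {}
        self.requests = []

    def get_dict(self):
        self.requests.append((self.params_dict['start'], self.params_dict['num']))
        return {"serpapi_pagination": {"next": "..."}}

client = FakeClient()
pages = sum(1 for _ in Pagination(client, start=0, end=20, num=10))

print(pages)            # 2 pages yielded before start + num exceeds end
print(client.requests)  # [(0, 11), (11, 10), (21, 10)]
```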

serpapi/serp_api_client.py

Lines changed: 5 additions & 4 deletions
```diff
@@ -30,9 +30,10 @@ class SerpApiClient(object):
     BACKEND = "https://serpapi.com"
     SERP_API_KEY = None
 
-    def __init__(self, params_dict, engine = None):
+    def __init__(self, params_dict, engine = None, timeout = 60000):
         self.params_dict = params_dict
         self.engine = engine
+        self.timeout = timeout
 
     def construct_url(self, path = "/search"):
         self.params_dict['source'] = 'python'
@@ -53,7 +54,7 @@ def get_response(self, path = '/search'):
         url = None
         try:
             url, parameter = self.construct_url(path)
-            response = requests.get(url, parameter, timeout=60000)
+            response = requests.get(url, parameter, timeout=self.timeout)
             return response
         except requests.HTTPError as e:
             print("fail: " + url)
@@ -167,8 +168,8 @@ def get_location(self, q, limit = 5):
         buffer = self.get_results('/locations.json')
         return json.loads(buffer)
 
-    def pagination(self, start = 0, end = 1000000000):
+    def pagination(self, start = 0, end = 1000000000, page_size = 10):
         """Return:
             Generator to iterate the search results pagination
        """
-        return Pagination(self, start, end)
+        return Pagination(self, start, end, page_size)
```
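The constructor change threads the HTTP timeout through to `requests.get`, so callers can override the previously hard-coded value (note that `requests` interprets `timeout` in seconds, so the 60000 default is effectively unbounded). A usage sketch, assuming the module path shown above:

```python
import os
from serpapi.serp_api_client import SerpApiClient

# give up after 30 seconds instead of the 60000-second default
client = SerpApiClient(
    {"q": "coffee", "api_key": os.getenv("API_KEY")},
    engine="google",
    timeout=30,
)
result = client.get_dict()
```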

tests/test_example_paginate.py

Lines changed: 42 additions & 7 deletions
```diff
@@ -4,6 +4,7 @@
 
 # original code: https://replit.com/@DimitryZub1/Scrape-Google-News-with-Pagination-python-serpapi
 class TestExamplePaginate(unittest.TestCase):
+
     @unittest.skipIf((os.getenv("API_KEY") == None), "no api_key provided")
     def test_paginate(self):
         # to get 2 pages
@@ -13,26 +14,60 @@ def test_paginate(self):
         params = {
             "q": "coca cola",
             "tbm": "nws",
-            "api_key": os.getenv("API_KEY"),
-            "start": start,
-            "end": end
+            "api_key": os.getenv("API_KEY")
         }
         # as proof of concept
         # urls collects
         urls = []
         # initialize a search
         search = GoogleSearch(params)
         # create a python generator
-        pages = search.pagination()
+        pages = search.pagination(start, end)
         # fetch one search result per iteration
         # using a basic python for loop
         # which invokes python iterator under the hood.
         for page in pages:
-            print(f"Current page: {page['serpapi_pagination']['current']}")
+            #print(f"Current page: {page['serpapi_pagination']['current']}")
             for news_result in page["news_results"]:
-                print(f"Title: {news_result['title']}\nLink: {news_result['link']}\n")
+                #print(f"Title: {news_result['title']}\nLink: {news_result['link']}\n")
                 urls.append(news_result['link'])
         # double check if things adds up.
         # total number pages expected
         # the exact number if variable depending on the search engine backend
-        self.assertGreater(len(urls), 200)
+        self.assertEqual(len(urls), 20, "number of search results")
+        self.assertEqual(len(set(urls)), len(urls), "duplicated elements detected")
+
+    @unittest.skipIf((os.getenv("API_KEY") == None), "no api_key provided")
+    def test_paginate_page_size(self):
+        # to get 2 pages with each page contains 20 search results
+        start = 0
+        end = 80
+        page_size = 20
+
+        # use parameters in
+        params = {
+            "q": "coca cola",
+            "tbm": "nws",
+            "api_key": os.getenv("API_KEY"),
+            "start": start,
+            "end": end,
+            "num": page_size
+        }
+        urls = []
+        search = GoogleSearch(params)
+        # parameter start,end,page_size will be used instead of pagination
+        pages = search.pagination()
+        page_count = 0
+        count = 0
+        for page in pages:
+            page_count += 1
+            #print(f"Current page: {page['serpapi_pagination']['current']}")
+            for news_result in page["news_results"]:
+                count += 1
+                print(f"{count} - title: {news_result['title']}")
+                urls.append(news_result['link'])
+
+        # check number of pages match
+        self.assertEqual(page_count, 4)
+        self.assertEqual(len(urls), end, "number of search results")
+        self.assertEqual(len(set(urls)), end, "duplicated search results")
```
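The `page_count == 4` expectation in `test_paginate_page_size` follows from the iterator's stop condition: starting from `start = 0`, `start` advances by `num = 20` after each yielded page and iteration stops once `start + num` exceeds `end = 80`. A quick sketch of that arithmetic:

```python
# mirror of the Pagination stop condition from serpapi/pagination.py
start, end, num = 0, 80, 20
page_count = 0
while start + num <= end:   # a page is yielded only while the window fits
    page_count += 1
    start += num
print(page_count)  # 4, matching self.assertEqual(page_count, 4)
```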

tests/test_google_search.py

Lines changed: 5 additions & 6 deletions
```diff
@@ -13,14 +13,13 @@ def setUp(self):
     @unittest.skipIf((os.getenv("API_KEY") == None), "no api_key provided")
     def test_paginate(self):
         search = GoogleSearch({"q": "Coffee", "location": "Austin,Texas"})
-        pages = search.pagination(0, 20)
-        print("display generated")
+        pages = search.pagination(0, 20, 10)
         urls = []
-        for result in pages:
-            urls.append(result['serpapi_pagination']['next'])
+        for page in pages:
+            urls.append(page['serpapi_pagination']['next'])
         self.assertEqual(len(urls), 2)
-        self.assertTrue("start=10" in urls[0])
-        self.assertTrue("start=20" in urls[1])
+        self.assertTrue("start=11" in urls[0])
+        self.assertTrue("start=21" in urls[1])
 
     @unittest.skipIf((os.getenv("API_KEY") == None), "no api_key provided")
     def test_get_json(self):
```
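The updated assertions (`start=11` and `start=21` instead of `start=10` and `start=20`) reflect the new `update()` logic above: the first request goes out with `num` bumped to 11 and each later request with `start` bumped by 1, so the backend's next-page URLs shift by one accordingly.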
