
Commit c2c35bc

serpapi#24 implement pagination
1 parent d2f0dbc commit c2c35bc

5 files changed: +116 additions, −35 deletions

README.md

Lines changed: 17 additions & 7 deletions
````diff
@@ -469,45 +469,55 @@ Let's collect links accross multiple search result pages.
 ```python
 # to get 2 pages
 start = 0
-end = 20
+end = 40
+page_size = 10
 
 # basic search parameters
-params = {
+parameter = {
     "q": "coca cola",
     "tbm": "nws",
     "api_key": os.getenv("API_KEY"),
+    # optional pagination parameter
+    # the pagination method can take argument directly
     "start": start,
-    "end": end
+    "end": end,
+    "num": page_size
 }
 
 # as proof of concept
 # urls collects
 urls = []
 
 # initialize a search
-search = GoogleSearch(params)
+search = GoogleSearch(parameter)
 
-# create a python generator
+# create a python generator using parameter
 pages = search.pagination()
+# or set custom parameter
+pages = search.pagination(start, end, page_size)
 
 # fetch one search result per iteration
 # using a basic python for loop
 # which invokes python iterator under the hood.
 for page in pages:
     print(f"Current page: {page['serpapi_pagination']['current']}")
-
     for news_result in page["news_results"]:
         print(f"Title: {news_result['title']}\nLink: {news_result['link']}\n")
         urls.append(news_result['link'])
 
 # check if the total number pages is as expected
 # note: the exact number if variable depending on the search engine backend
-self.assertGreater(len(urls), 200)
+if len(urls) == (end - start):
+    print("all search results count match!")
+if len(urls) == len(set(urls)):
+    print("all search results are unique!")
 ```
 
 Examples to fetch links with pagination: [test file](https://github.com/serpapi/google-search-results-python/blob/master/tests/test_example_paginate.py), [online IDE](https://replit.com/@DimitryZub1/Scrape-Google-News-with-Pagination-python-serpapi)
 
 ## Change log
+2021-06-05 @ 2.4.0
+ - add page size support using num parameter
 2021-06-05 @ 2.3.0
 - add pagination support
 2021-04-28 @ 2.2.0
````
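For reference, a minimal runnable sketch of the updated README example, assuming the `google-search-results` package from this branch is installed and an `API_KEY` environment variable is set:

```python
import os
from serpapi import GoogleSearch

# window of results to fetch: 4 pages of 10 results each
start, end, page_size = 0, 40, 10

search = GoogleSearch({
    "q": "coca cola",
    "tbm": "nws",                      # Google News results
    "api_key": os.getenv("API_KEY"),
})

urls = []
# pagination() returns an iterator; each iteration performs one request
for page in search.pagination(start, end, page_size):
    for news_result in page["news_results"]:
        urls.append(news_result["link"])

print(f"collected {len(urls)} links, {len(set(urls))} unique")
```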

serpapi/pagination.py

Lines changed: 47 additions & 11 deletions
```diff
@@ -1,20 +1,56 @@
+import pprint
+
+DEFAULT_START = 0
+DEFAULT_END = 1000000000
+DEFAULT_num = 10
 
 # Paginate response in SearpApi
 class Pagination:
-    def __init__(self, client, start = 0, end = 1000000000, page_size = 10):
+
+    def __init__(self, client, start = DEFAULT_START, end = DEFAULT_END, num = DEFAULT_num):
+        # serp api client
         self.client = client
+        # range
         self.start = start
         self.end = end
-        self.page_size = page_size
-
+        self.num = num
+
+        # use value from the client
+        if self.start == DEFAULT_START:
+            if 'start' in self.client.params_dict:
+                self.start = self.client.params_dict['start']
+        if self.end == DEFAULT_END:
+            if 'end' in self.client.params_dict:
+                self.end = self.client.params_dict['end']
+        if self.num == DEFAULT_num:
+            if 'num' in self.client.params_dict:
+                self.num = self.client.params_dict['num']
+
+        # basic check
+        if self.start > self.end:
+            raise "start: {} must be less than end: {}".format(self.start, self.end)
+        if(self.start + self.num) > self.end:
+            raise "start + num: {} + {} must be less than end: {}".format(self.start, self.num, self.end)
+
     def __iter__(self):
+        self.update()
         return self
 
-    def __next__(self):
-        # execute search
+    def update(self):
         self.client.params_dict['start'] = self.start
+        self.client.params_dict['num'] = self.num
+        if self.start == 0:
+            self.client.params_dict['num'] += 1
+        if self.start > 0:
+            self.client.params_dict['start'] += 1
+
+    def __next__(self):
+        # update parameter
+        self.update()
+
+        # execute request
         result = self.client.get_dict()
-
+
         # stop if backend miss to return serpapi_pagination
         if not 'serpapi_pagination' in result:
             raise StopIteration
@@ -23,11 +59,11 @@ def __next__(self):
         if not 'next' in result['serpapi_pagination']:
             raise StopIteration
 
-        # increment page
-        self.start += self.page_size
-
-        # ends
-        if self.start > self.end:
+        # ends if no next page
+        if self.start + self.num > self.end:
             raise StopIteration
+
+        # increment start page
+        self.start += self.num
 
         return result
```
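To see what `update()` actually sends without calling the API, here is a small sketch with a hypothetical stub client (`FakeClient` is not part of the library; `Pagination` only touches `params_dict` and `get_dict()`). It makes the offset handling visible: `num` is bumped by 1 on the first request, `start` is bumped by 1 on every later one, and the final out-of-range request is still executed before `StopIteration` fires:

```python
from serpapi.pagination import Pagination

# hypothetical stand-in for SerpApiClient: records each request's offsets
# and always pretends a next page exists
class FakeClient:
    def __init__(self):
        self.params_dict = {}
        self.requests = []

    def get_dict(self):
        self.requests.append((self.params_dict['start'], self.params_dict['num']))
        return {"serpapi_pagination": {"next": "..."}}

client = FakeClient()
pages = sum(1 for _ in Pagination(client, start=0, end=20, num=10))

print(pages)            # 2 pages yielded before start + num exceeds end
print(client.requests)  # [(0, 11), (11, 10), (21, 10)]
```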

serpapi/serp_api_client.py

Lines changed: 5 additions & 4 deletions
```diff
@@ -30,9 +30,10 @@ class SerpApiClient(object):
     BACKEND = "https://serpapi.com"
     SERP_API_KEY = None
 
-    def __init__(self, params_dict, engine = None):
+    def __init__(self, params_dict, engine = None, timeout = 60000):
         self.params_dict = params_dict
         self.engine = engine
+        self.timeout = timeout
 
     def construct_url(self, path = "/search"):
         self.params_dict['source'] = 'python'
@@ -53,7 +54,7 @@ def get_response(self, path = '/search'):
         url = None
         try:
             url, parameter = self.construct_url(path)
-            response = requests.get(url, parameter, timeout=60000)
+            response = requests.get(url, parameter, timeout=self.timeout)
             return response
         except requests.HTTPError as e:
             print("fail: " + url)
@@ -167,8 +168,8 @@ def get_location(self, q, limit = 5):
         buffer = self.get_results('/locations.json')
         return json.loads(buffer)
 
-    def pagination(self, start = 0, end = 1000000000):
+    def pagination(self, start = 0, end = 1000000000, page_size = 10):
         """Return:
             Generator to iterate the search results pagination
        """
-        return Pagination(self, start, end)
+        return Pagination(self, start, end, page_size)
```
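The constructor change threads the HTTP timeout through to `requests.get`, so callers can override the previously hard-coded value (note that `requests` interprets `timeout` in seconds, so the 60000 default is effectively unbounded). A usage sketch, assuming the module path shown above:

```python
import os
from serpapi.serp_api_client import SerpApiClient

# give up after 30 seconds instead of the 60000-second default
client = SerpApiClient(
    {"q": "coffee", "api_key": os.getenv("API_KEY")},
    engine="google",
    timeout=30,
)
result = client.get_dict()
```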

tests/test_example_paginate.py

Lines changed: 42 additions & 7 deletions
```diff
@@ -4,6 +4,7 @@
 
 # original code: https://replit.com/@DimitryZub1/Scrape-Google-News-with-Pagination-python-serpapi
 class TestExamplePaginate(unittest.TestCase):
+
     @unittest.skipIf((os.getenv("API_KEY") == None), "no api_key provided")
     def test_paginate(self):
         # to get 2 pages
@@ -13,26 +14,60 @@ def test_paginate(self):
         params = {
             "q": "coca cola",
             "tbm": "nws",
-            "api_key": os.getenv("API_KEY"),
-            "start": start,
-            "end": end
+            "api_key": os.getenv("API_KEY")
         }
         # as proof of concept
         # urls collects
         urls = []
         # initialize a search
         search = GoogleSearch(params)
         # create a python generator
-        pages = search.pagination()
+        pages = search.pagination(start, end)
         # fetch one search result per iteration
         # using a basic python for loop
         # which invokes python iterator under the hood.
         for page in pages:
-            print(f"Current page: {page['serpapi_pagination']['current']}")
+            #print(f"Current page: {page['serpapi_pagination']['current']}")
             for news_result in page["news_results"]:
-                print(f"Title: {news_result['title']}\nLink: {news_result['link']}\n")
+                #print(f"Title: {news_result['title']}\nLink: {news_result['link']}\n")
                 urls.append(news_result['link'])
         # double check if things adds up.
         # total number pages expected
         # the exact number if variable depending on the search engine backend
-        self.assertGreater(len(urls), 200)
+        self.assertEqual(len(urls), 20, "number of search results")
+        self.assertEqual(len(set(urls)), len(urls), "duplicated elements detected")
+
+    @unittest.skipIf((os.getenv("API_KEY") == None), "no api_key provided")
+    def test_paginate_page_size(self):
+        # to get 2 pages with each page contains 20 search results
+        start = 0
+        end = 80
+        page_size = 20
+
+        # use parameters in
+        params = {
+            "q": "coca cola",
+            "tbm": "nws",
+            "api_key": os.getenv("API_KEY"),
+            "start": start,
+            "end": end,
+            "num": page_size
+        }
+        urls = []
+        search = GoogleSearch(params)
+        # parameter start,end,page_size will be used instead of pagination
+        pages = search.pagination()
+        page_count = 0
+        count = 0
+        for page in pages:
+            page_count += 1
+            #print(f"Current page: {page['serpapi_pagination']['current']}")
+            for news_result in page["news_results"]:
+                count += 1
+                print(f"{count} - title: {news_result['title']}")
+                urls.append(news_result['link'])
+
+        # check number of pages match
+        self.assertEqual(page_count, 4)
+        self.assertEqual(len(urls), end, "number of search results")
+        self.assertEqual(len(set(urls)), end, "duplicated search results")
```
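The `page_count == 4` expectation in `test_paginate_page_size` follows from the iterator's stop condition: starting from `start = 0`, `start` advances by `num = 20` after each yielded page and iteration stops once `start + num` exceeds `end = 80`. A quick sketch of that arithmetic:

```python
# mirror of the Pagination stop condition from serpapi/pagination.py
start, end, num = 0, 80, 20
page_count = 0
while start + num <= end:   # a page is yielded only while the window fits
    page_count += 1
    start += num
print(page_count)  # 4, matching self.assertEqual(page_count, 4)
```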

tests/test_google_search.py

Lines changed: 5 additions & 6 deletions
```diff
@@ -13,14 +13,13 @@ def setUp(self):
     @unittest.skipIf((os.getenv("API_KEY") == None), "no api_key provided")
     def test_paginate(self):
         search = GoogleSearch({"q": "Coffee", "location": "Austin,Texas"})
-        pages = search.pagination(0, 20)
-        print("display generated")
+        pages = search.pagination(0, 20, 10)
         urls = []
-        for result in pages:
-            urls.append(result['serpapi_pagination']['next'])
+        for page in pages:
+            urls.append(page['serpapi_pagination']['next'])
         self.assertEqual(len(urls), 2)
-        self.assertTrue("start=10" in urls[0])
-        self.assertTrue("start=20" in urls[1])
+        self.assertTrue("start=11" in urls[0])
+        self.assertTrue("start=21" in urls[1])
 
     @unittest.skipIf((os.getenv("API_KEY") == None), "no api_key provided")
     def test_get_json(self):
```
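The updated assertions (`start=11` and `start=21` instead of `start=10` and `start=20`) reflect the new `update()` logic above: the first request goes out with `num` bumped to 11 and each later request with `start` bumped by 1, so the backend's next-page URLs shift by one accordingly.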
