DEV Community

Cover image for Scrape Baidu Organic Search with Python
Dmitriy Zub ☀️
Dmitriy Zub ☀️

Posted on • Edited on

Scrape Baidu Organic Search with Python

Contents: intro, imports, organic result, answer box, related images, differences, links, outro.

Intro

This blog post is a collection of examples showing how to scrape various Baidu Search results using Python, as well as an alternative solution, SerpApi. You can stack the examples on top of each other to suit your particular needs.

Imports

from bs4 import BeautifulSoup import requests, lxml, json from serpapi import BaiduSearch # only for SerpApi solution import os # only used with SerpApi to create environment for API_KEY 
Enter fullscreen mode Exit fullscreen mode

Organic Results

from bs4 import BeautifulSoup import requests, lxml, json headers = { "User-Agent": "Mozilla/5.0 (Linux; Android 10; HD1913) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.105 Mobile Safari/537.36 EdgA/46.1.2.5140" } def get_organic_results(): html = requests.get('https://www.baidu.com/s?&tn=baidu&wd=minecraft',headers=headers) soup = BeautifulSoup(html.text, 'lxml') baidu_data = [] for result in soup.select('.result.c-container.new-pmd'): title = result.select_one('.t').text link = result.select_one('.t').a['href'] displayed_link = result.select_one('.c-showurl').text snippet = result.select_one('.c-abstract').text try: sitelink_title = result.select_one('.op-se-listen-recommend').text except: sitelink_title = None try: sitelink_link = result.select_one('.op-se-listen-recommend')['herf'] except: sitelink_link = None baidu_data.append({ 'title': title, 'link': link, 'displayed_link': displayed_link, 'snippet': snippet, 'sitelinks': {'title': sitelink_title, 'link': sitelink_link}, }) print(json.dumps(baidu_data, indent=2, ensure_ascii=False)) # Part of the output: ''' [ { "title": "minecraft website - 官方网站 | Minecraft", "link": "http://www.baidu.com/link?url=_XTFGPU6ibzEJnDEdC4y2_WnTCHh-xaHkiR06lAOA6a", "displayed_link": "minecraft.net/", "snippet": "2021年3月3日 我的世界是一款堆方块、不断冒险的游戏。在此购买,或浏览网站了解最新消息和社区的精彩创意!", "sitelinks": { "title": null, "link": null } } ] ''' 
Enter fullscreen mode Exit fullscreen mode

Using Baidu Organic Search Results API

import os, json from serpapi import BaiduSearch def get_organic_results(): params = { "engine": "baidu", "q": "minecraft", "api_key": os.getenv("API_KEY"), } search = BaiduSearch(params) results = search.get_dict() baidu_data = [] for result in results['organic_results']: title = result['title'] link = result['link'] try: displayed_link = result['displayed_link'] except: displayed_link = None try: snippet = result['snippet'] except: snippet = None try: sitelink_title = result['rich_snippet']['sitelinks']['title'] except: sitelink_title = None try: sitelink_link = result['rich_snippet']['sitelinks']['link'] except: sitelink_link = None baidu_data.append({ 'title': title, 'link': link, 'displayed_link': displayed_link, 'snippet': snippet, 'sitelinks': [{'title': sitelink_title, 'link':sitelink_link}], }) print(json.dumps(baidu_data, indent=2, ensure_ascii=False)) # Part of the output: ''' [ { "title": "minecraft website - 官方网站 | Minecraft", "link": "http://www.baidu.com/link?url=OD7rfRPzLty76yZJ9dimCAV2VS-QyXURXbLmjXH3wq3", "displayed_link": "minecraft.net/", "snippet": "我的世界是一款堆方块、不断冒险的游戏。在此购买,或浏览网站了解最新消息和社区的精彩创意!", "sitelinks": [ { "title": null, "link": null } ] } ] ''' 
Enter fullscreen mode Exit fullscreen mode

Answer Box

from bs4 import BeautifulSoup
import requests, lxml, re, json

headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 10; HD1913) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.105 Mobile Safari/537.36 EdgA/46.1.2.5140"
}


def get_answerbox_result():
    """Scrape Baidu's dictionary answer box (English word card).

    Prints a JSON array with the word, its British/American phonetics and
    audio links, and its definition; prints 'No answer box found' when the
    page has no dictionary card.

    NOTE(review): the original queried 'jet li', which is not a dictionary
    lookup; the sample output below comes from a word query ('coffee'), so
    that query is used here.
    """
    html = requests.get('https://www.baidu.com/s?&tn=baidu&wd=coffee', headers=headers)
    soup = BeautifulSoup(html.text, 'lxml')

    answer_box = []

    for result in soup.find_all('div', class_='c-border'):
        # A missing node makes select_one()/find() return None, so the
        # chained access raises. Catch only those errors per card instead of
        # wrapping the whole function in a bare except that hides real bugs.
        try:
            english_word = result.select_one('.op_dict3_marginRight').text

            # british
            british_phonetic = result.select_one('.c-color-t+ td .op_dict3_gap_small').text
            british_chinese_character = result.select_one('.c-color-t+ td .op_dict3_font14').text
            british_audio_link = result.find('a', class_='op_dict3_how_read c-gap-right-middle')['url']

            # american
            american_phonetic = result.select_one('.c-color-t~ td+ td .op_dict3_gap_small').text
            american_chinese_character = result.select_one('.c-color-t~ td+ td .op_dict3_font14').text
            # NOTE(review): this is the same first <a> as the British one, so
            # both audio links come out identical (visible in the sample
            # output below) — verify the correct selector against live HTML.
            american_audio_link = result.find('a', class_='op_dict3_how_read c-gap-right-middle')['url']

            definition_raw = result.select_one('.c-gap-bottom-xsmall+ .op_dict3_english_result_table .op_dict_text2').text
        except (AttributeError, TypeError, KeyError):
            # This div is not the dictionary card — skip it.
            continue

        # Remove ALL whitespace with a regex: the raw definition text is
        # littered with mixed whitespace runs, so replace('\n', '') or
        # strip() alone are not enough.
        definition = re.sub(r'\s+', '', definition_raw)

        answer_box.append({
            'english_word': english_word,
            'british': {'phonetic': british_phonetic, 'chinese_character': british_chinese_character, 'audio_link': british_audio_link},
            'american': {'phonetic': american_phonetic, 'chinese_character': american_chinese_character, 'audio_link': american_audio_link},
            'definition': definition,  # key typo 'defenition' fixed
        })

    if answer_box:
        print(json.dumps(answer_box, indent=2, ensure_ascii=False))
    else:
        print('No answer box found')


# Output:
'''
[
  {
    "english_word": "coffee",
    "british": {
      "phonetic": "[ˈkɒfi]",
      "chinese_character": "英",
      "audio_link": "https://sp0.baidu.com/-rM1hT4a2gU2pMbgoY3K/gettts?lan=uk&text=coffee&spd=2&source=alading"
    },
    "american": {
      "phonetic": "[ˈkɔːfi]",
      "chinese_character": "美",
      "audio_link": "https://sp0.baidu.com/-rM1hT4a2gU2pMbgoY3K/gettts?lan=uk&text=coffee&spd=2&source=alading"
    },
    "definition": "(烘烤过的)咖啡豆;咖啡粉;咖啡(热饮料);一杯咖啡;"
  }
]
'''
Enter fullscreen mode Exit fullscreen mode

Using SerpApi Answer box

import os, json from serpapi import BaiduSearch def get_answerbox_result(): params = { "engine": "baidu", "q": "coffee", "api_key": os.getenv("API_KEY"), } search = BaiduSearch(params) results = search.get_dict() for result in results['answer_box']: title = result['title'] link = result['link'] displayed_link = result['displayed_link'] english_word = result['english_word'] british = result['british'] american = result['american'] defenitions = result['definitions'][0] # array output  print(f'{title}\n{link}\n{displayed_link}\n{english_word}\n{british}\n{american}\n{defenitions}') # Output: ''' coffee - 百度翻译 http://www.baidu.com/link?url=JA5gottCkKOdztdz_enXoECH2LfUZwlDRs-ll_E7fa6TXpjY6hQzf1GzPU7gTxHkOTOTFpSm6g_6OlvRNqjjP_ fanyi.baidu.com coffee {'phonetic': '[ˈkɒfi]', 'chinese_character': '英', 'audio_link': 'https://sp0.baidu.com/-rM1hT4a2gU2pMbgoY3K/gettts?lan=uk&text=coffee&spd=2&source=alading'} {'phonetic': '[ˈkɔːfi]', 'chinese_character': '美', 'audio_link': 'https://sp0.baidu.com/-rM1hT4a2gU2pMbgoY3K/gettts?lan=en&text=coffee&spd=2&source=alading'} ['n. (烘烤过的)咖啡豆; 咖啡粉; 咖啡(热饮料); 一杯咖啡;'] ''' 
Enter fullscreen mode Exit fullscreen mode

Related Images

from bs4 import BeautifulSoup import requests, lxml, re, json headers = { "User-Agent": "Mozilla/5.0 (Linux; Android 10; HD1913) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.105 Mobile Safari/537.36 EdgA/46.1.2.5140" } def get_related_images_result(): html = requests.get('https://www.baidu.com/s?&tn=baidu&wd=jet li',headers=headers) soup = BeautifulSoup(html.text, 'lxml') for related_image in soup.find_all('div', class_='op-img-address-divide-high'): link = related_image.a['href'] image = related_image.a.img['src'] print(f'{link}\n{image}\n') # part of output: ''' http://www.baidu.com/link?url=eSWgjIjvJxs9ihAwqPFMk0w0oeXDbwJ7Hi4mYNJzirGQ1ssl8BuLkI7GhtPPou-J2tYlh7CaMQhGC8uAStmiI7Kx2Ln8mNBobjTQ8J8elSeHIHbKy2UKJPMNB8Jv8C6JxzjRlSeOVeGhmGqg0HvT69706LMw5k7KX5V4aKLgkfTrDjYLwG1b9wRG_n4G752-MLNP_u0rJLwS0PGKAdIctA-oStoNf8efPJZmkExIpA6GZQ1-T0YyA445E9uAtWldweZwOFrZ5H-KzkT5xKW3e33kFyGrQV5Rb_li6YZ6VZ8M4K3ESwO6tzEex_eZxq_xrhRGddDw1LHTn1NmXqvsrkCEpPze5oAtsXNEaSMnSENi3q_qpTucgaWN8eDYk4ciQr42JVuv1cgrHKSf4_0dNwBhiAQB8uj6UIJFDZ-tFAIX1O2ZWQGhoBgpVm7DjVIVoVVraQx9PwZVTq80P3DhhH91U6QkSh4y1LmZJxHZVnRQ-_pZUJKircxw9ofSrgwSWNxkYo6NXwwn9ys9ggz12PHJo5IvjJRGFIlaEm1ZZHfuSfEusdI71L9RQWuSrWpxJiMqS-oqe_pSNgYxPD1PK_ https://dss0.bdstatic.com/70cFuHSh_Q1YnxGkpoWK1HF6hhy/it/u=2262600612,448000708&fm=26&gp=0.jpg ''' 
Enter fullscreen mode Exit fullscreen mode

Using SerpApi Related Images

import os, json from serpapi import BaiduSearch def get_related_images_result(): params = { "engine": "baidu", "q": "jet li", "api_key": os.getenv("API_KEY"), } search = BaiduSearch(params) results = search.get_dict() for result in results['organic_results']: try: images = result['related_images'] except: images = None print(images) # part of the output: ''' [{'link': 'http://www.baidu.com/link?url=YQnuO4njMj88UErWJBkGuS4aGdNiv9ZVtySw5fqiVpRTwmgJFEm_ZhCw9Zbc7U1C3Red20zd6N-FzwpURm5jDcnUsp34rhTHApNvnHuB3DlhwIu7-4BwuzlITjhSrXr0DgMBZGNt3UhgGNVTrybeZ6IPGD8Ej_oqSASrusItTQiAVlW-khcZ0A8Q1oWo6Dea_9u1gigFS30GAwBJGz4RdrnFmcyAo7AshuflPdptpcLWqx5TTYF0WjjQVVULBSRmETaEfEGIuO_YMoOKqGoc9d9d9o9QUmRClayPSf5xTppjPGYQGZmUDJ-93grTkqry63e4nXW460Lf-8ctZfnV36UTpWm-hmhXHw7pjATVT88Rmvbxo_hVLyH0dUNdapqsqTdl6YBYFA4k1JjmR5ibhDHd5tH1QuBc5XJVoG1HL-dxNjU_a3NecDeejZstG9zAr59ESZli63E8tgX1THSJ0xeY9G9VOZI-dx79kSg0pUyzctaux8jHWlh48D7qcg5sJCDh_V33kOnhTp9pbJqI3DR4r05Ma_WowxYUV87-pkMxmSnPXtK8Av6lCQgvz7tAFSmzLoPWmz5Fd_cSJ_yB7a', 'image': 'https://dss0.bdstatic.com/70cFuHSh_Q1YnxGkpoWK1HF6hhy/it/u=2262600612,448000708&fm=26&gp=0.jpg'}] ''' 
Enter fullscreen mode Exit fullscreen mode

Differences between API and non-API solution

  • It's a fast solution with straightforward data exploration.
  • don't have to figure out how to grab certain elements.
  • don't have to maintain the parser if things are changed in the HTML code.

Links

Code in the online IDE.
Documentation: Baidu Organic Search Results API.

Outro

If you have any questions or something isn't working correctly or you want to write something else, feel free to drop a comment in the comment section or via Twitter at @serp_api.

Yours,
Dimitry, and the rest of SerpApi Team.

Top comments (0)