Skip to content

Instantly share code, notes, and snippets.

@dimitryzub
Created June 18, 2021 19:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dimitryzub/14fab631a8178c36dd7250172fca61a1 to your computer and use it in GitHub Desktop.
Save dimitryzub/14fab631a8178c36dd7250172fca61a1 to your computer and use it in GitHub Desktop.
baidu_scrape_organic_results
from bs4 import BeautifulSoup
import requests, lxml, json
# Request headers passed to requests.get() in get_organic_results().
# The User-Agent value is a mobile (Android) browser identity string; it is
# split across several literals only to keep the line length readable.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Linux; Android 10; HD1913) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/89.0.4389.105 Mobile Safari/537.36 "
        "EdgA/46.1.2.5140"
    ),
}
def _text_or_none(node):
    """Return ``node.text``, or None when the selector matched nothing."""
    return node.text if node is not None else None


def _href_or_none(node):
    """Return the ``href`` attribute of ``node``, or None if node/attribute is absent."""
    return node.get('href') if node is not None else None


def get_organic_results(query='minecraft'):
    """Fetch a Baidu search results page and print its organic results as JSON.

    Each element matching ``.result.c-container.new-pmd`` becomes one dict
    with the keys ``title``, ``link``, ``displayed_link``, ``snippet`` and
    ``sitelinks`` (itself a dict with ``title`` and ``link``).  Any piece
    that is missing from the markup is reported as None instead of aborting
    the whole scrape.

    Args:
        query: Search phrase sent as the ``wd`` query parameter.  Defaults
            to ``'minecraft'``, the value that used to be hard-coded.

    Returns:
        The list of result dicts that was printed.

    Raises:
        requests.exceptions.RequestException: on connection failure, on
            timeout, or when the server answers with an HTTP error status.
    """
    response = requests.get(
        'https://www.baidu.com/s',
        # Let requests URL-encode the query (needed for non-ASCII phrases).
        params={'tn': 'baidu', 'wd': query},
        headers=headers,
        # Without a timeout requests may block forever on a stalled server.
        timeout=30,
    )
    # Fail loudly on an HTTP error instead of silently parsing an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'lxml')
    baidu_data = []

    for result in soup.select('.result.c-container.new-pmd'):
        heading = result.select_one('.t')
        # The result link is the first <a> inside the heading, when present.
        anchor = heading.a if heading is not None else None
        sitelink = result.select_one('.op-se-listen-recommend')

        baidu_data.append({
            'title': _text_or_none(heading),
            'link': _href_or_none(anchor),
            'displayed_link': _text_or_none(result.select_one('.c-showurl')),
            'snippet': _text_or_none(result.select_one('.c-abstract')),
            'sitelinks': {
                'title': _text_or_none(sitelink),
                # Was ['herf'] (typo), which always raised and yielded None.
                'link': _href_or_none(sitelink),
            },
        })

    print(json.dumps(baidu_data, indent=2, ensure_ascii=False))
    return baidu_data
# Part of the output:
'''
[
{
"title": "minecraft website - 官方网站 | Minecraft",
"link": "http://www.baidu.com/link?url=_XTFGPU6ibzEJnDEdC4y2_WnTCHh-xaHkiR06lAOA6a",
"displayed_link": "minecraft.net/",
"snippet": "2021年3月3日 我的世界是一款堆方块、不断冒险的游戏。在此购买,或浏览网站了解最新消息和社区的精彩创意!",
"sitelinks": {
"title": null,
"link": null
}
}
]
'''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment