Skip to content

Instantly share code, notes, and snippets.

@dimitryzub
Created June 18, 2021 19:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dimitryzub/5e1ded440a87bfe88e2d247af652fc7e to your computer and use it in GitHub Desktop.
Save dimitryzub/5e1ded440a87bfe88e2d247af652fc7e to your computer and use it in GitHub Desktop.
baidu_scrape_answer_box
from bs4 import BeautifulSoup
import requests, lxml, re, json
# HTTP request headers passed to requests.get(); the User-Agent value is an
# Android mobile browser string (Chrome-based Edge, "EdgA").
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Linux; Android 10; HD1913) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/89.0.4389.105 Mobile Safari/537.36 "
        "EdgA/46.1.2.5140"
    ),
}
def _parse_dictionary_card(result):
    """Extract one dictionary entry from a ``div.c-border`` result container.

    Args:
        result: BeautifulSoup tag for a single ``c-border`` element.

    Returns:
        A dict with the keys ``english_word``, ``british``, ``american`` and
        ``defenition`` (the key spelling is kept as-is because it is part of
        the emitted JSON).

    Raises:
        AttributeError: a CSS selector matched nothing, so ``select_one``
            returned ``None``.
        IndexError: the container holds no pronunciation audio anchor.
        KeyError: an audio anchor has no ``url`` attribute.
    """
    english_word = result.select_one('.op_dict3_marginRight').text

    # British pronunciation: the cell right after the ``.c-color-t`` cell.
    british_phonetic = result.select_one('.c-color-t+ td .op_dict3_gap_small').text
    british_chinese_character = result.select_one('.c-color-t+ td .op_dict3_font14').text

    # American pronunciation: the following sibling cell.
    american_phonetic = result.select_one('.c-color-t~ td+ td .op_dict3_gap_small').text
    american_chinese_character = result.select_one('.c-color-t~ td+ td .op_dict3_font14').text

    # Collect every audio anchor in document order. Calling find() twice with
    # the same arguments returns the first anchor both times, which made the
    # American link a copy of the British one.
    # NOTE(review): assumes the anchors appear British-then-American, like
    # the phonetic cells above -- confirm against the live markup.
    audio_links = [
        anchor['url']
        for anchor in result.find_all('a', class_='op_dict3_how_read c-gap-right-middle')
    ]
    british_audio_link = audio_links[0]
    # A missing second anchor yields None instead of a wrong (British) link.
    american_audio_link = audio_links[1] if len(audio_links) > 1 else None

    raw_definition = result.select_one(
        '.c-gap-bottom-xsmall+ .op_dict3_english_result_table .op_dict_text2'
    ).text
    # The definition text has whitespace scattered through it (not only at
    # the ends), so strip() or replace('\n', '') is not enough; drop every
    # whitespace run instead.
    definition = re.sub(r'\s+', '', raw_definition)

    return {
        'english_word': english_word,
        'british': {'phonetic': british_phonetic, 'chinese_character': british_chinese_character, 'audio_link': british_audio_link},
        'american': {'phonetic': american_phonetic, 'chinese_character': american_chinese_character, 'audio_link': american_audio_link},
        'defenition': definition,
    }


def get_answerbox_result():
    """Fetch a Baidu results page and print its dictionary answer box as JSON.

    Requests the hard-coded search URL using the module-level ``headers``,
    parses every ``div.c-border`` container and prints the collected entries
    as an indented, non-ASCII-escaped JSON list. If any expected element is
    missing from a container, prints ``No answer box found`` instead.

    Returns:
        None. The result is written to stdout.

    Raises:
        requests.RequestException: the HTTP request failed or timed out
            (the request is made outside the ``try`` block, as before).
    """
    # timeout keeps an unresponsive server from blocking the script forever.
    html = requests.get(
        'https://www.baidu.com/s?&tn=baidu&wd=jet li',
        headers=headers,
        timeout=30,
    )
    soup = BeautifulSoup(html.text, 'lxml')

    try:
        answer_box = [
            _parse_dictionary_card(result)
            for result in soup.find_all('div', class_='c-border')
        ]
    except (AttributeError, IndexError, KeyError):
        # Only "markup not as expected" errors mean there is no answer box;
        # anything else (e.g. KeyboardInterrupt, real bugs) now propagates.
        print('No answer box found')
    else:
        print(json.dumps(answer_box, indent=2, ensure_ascii=False))
# Output:
'''
[
{
"english_word": "coffee",
"british": {
"phonetic": "[ˈkɒfi]",
"chinese_character": "英",
"audio_link": "https://sp0.baidu.com/-rM1hT4a2gU2pMbgoY3K/gettts?lan=uk&text=coffee&spd=2&source=alading"
},
"american": {
"phonetic": "[ˈkɔːfi]",
"chinese_character": "美",
"audio_link": "https://sp0.baidu.com/-rM1hT4a2gU2pMbgoY3K/gettts?lan=uk&text=coffee&spd=2&source=alading"
},
"defenition": "(烘烤过的)咖啡豆;咖啡粉;咖啡(热饮料);一杯咖啡;"
}
]
'''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment