Skip to content

Instantly share code, notes, and snippets.

@Jerry0420
Last active August 11, 2020 08:52
Show Gist options
  • Save Jerry0420/b3c2eaea780096bc3328800fd1b3572d to your computer and use it in GitHub Desktop.
Save Jerry0420/b3c2eaea780096bc3328800fd1b3572d to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import requests
from bs4 import BeautifulSoup
import json
import re
def _movie_id_from_url(page_url):
    """Extract the movie id from a movie page URL.

    URLs of the form ``.../id=N`` yield the text after the last ``=``.
    Otherwise the trailing run of digits is used; if there is none, the
    URL is returned unchanged (which is what ``split('=')[-1]`` yields).
    """
    if '=' in page_url:
        return page_url.split('=')[-1]
    # NOTE(review): assumes slug-style URLs end with the numeric id
    # (e.g. ".../some-title-1234") -- confirm against live pages.
    trailing_digits = re.search(r'(\d+)/?$', page_url)
    return trailing_digits.group(1) if trailing_digits else page_url


def _names_from_next_div(info):
    """Return the names in the first <div> after *info*, joined by '|'.

    Spaces and newlines are removed and the '、' separator is replaced
    with '|'. Returns '' when no following <div> exists.
    """
    names_block = info.find_next('div')
    if names_block is None:
        return ''
    cleaned = names_block.text.strip().replace(' ', '').replace('\n', '')
    return cleaned.replace('、', '|')


def main(document, url=None):
    """Scrape one movie page and return its fields as a dict of strings.

    Args:
        document: HTML markup of the movie page (the script entry point
            passes ``response.content``).
        url: Optional URL the page was fetched from, used to derive
            ``movie_id``. When omitted, the page's ``og:url`` meta value
            is used instead, so the function no longer depends on a
            module-level ``url`` variable being defined.

    Returns:
        A dict with the keys ``url``, ``movie_id``, ``name_ch``,
        ``name_en``, ``genres``, ``release_date``, ``company``,
        ``imdb_score``, ``directors``, ``actors``, ``img_url``,
        ``content``, ``yahoo_score`` and ``vote_count``. A field that
        cannot be found is ''. Multi-valued fields are '|'-separated.
        If an unexpected error occurs it is printed and the fields
        collected so far are returned (best-effort scraping).
    """
    document = BeautifulSoup(document, 'lxml')
    result = {}

    url_block = document.select_one('meta[property="og:url"]')
    movie_info_block = document.select_one('.movie_intro_info_r')
    if movie_info_block:
        name_ch_block = movie_info_block.select_one('h1')
        name_en_block = movie_info_block.select_one('h3')
        genres_blocks = movie_info_block.select('.level_name')
        all_info = movie_info_block.select('span')
    else:
        # Without the info panel none of the fields below can be found;
        # use empty placeholders instead of failing on None.
        name_ch_block = None
        name_en_block = None
        genres_blocks = []
        all_info = []
    big_image_block = document.select_one('.movie_intro_info_l .btn_zoomin')
    image_block = document.select_one('meta[property="og:image"]')
    content_block = document.select_one('#story')
    yahoo_score_block = document.select_one('.score_num.count')
    vote_count_block = document.select_one('.starbox2 span')

    try:
        result['url'] = url_block['content'] if url_block else ''
        source_url = url if url is not None else result['url']
        result['movie_id'] = _movie_id_from_url(source_url)
        result['name_ch'] = name_ch_block.text if name_ch_block else ''
        result['name_en'] = name_en_block.text if name_en_block else ''
        result['genres'] = '|'.join(
            genres_block.text.strip() for genres_block in genres_blocks
        )

        release_date = ''
        company = ''
        imdb_score = ''
        directors = ''
        actors = ''
        # The checks are deliberately independent (not elif): one span
        # may match several labels, and a later span overrides an
        # earlier one for the same field.
        for info in all_info:
            info_text = info.text
            if '上映日期' in info_text:
                release_date = info_text.split(':')[-1]
            if '發行公司' in info_text:
                company = info_text.split(':')[-1]
            if 'IMDb分數' in info_text:
                imdb_score = info_text.split(':')[-1]
            if '導演' in info_text:
                directors = _names_from_next_div(info)
            if '演員' in info_text:
                actors = _names_from_next_div(info)
        result['release_date'] = release_date
        result['company'] = company
        result['imdb_score'] = imdb_score
        result['directors'] = directors
        result['actors'] = actors

        # Prefer the zoom-in link's target; fall back to the og:image.
        img_url = ''
        if big_image_block:
            img_url = big_image_block['href']
        elif image_block:
            img_url = image_block['content']
        result['img_url'] = img_url

        result['content'] = (
            content_block.text.strip().replace('\r', '').replace('\n', '')
            if content_block else ''
        )
        # The maximum score is 5.
        result['yahoo_score'] = (
            yahoo_score_block.text if yahoo_score_block else ''
        )
        # Keep only the first run of digits from the vote-count text.
        vote_text = vote_count_block.text if vote_count_block else ''
        vote_match = re.search(r'\d+', vote_text)
        result['vote_count'] = vote_match.group(0) if vote_match else ''
    except Exception as error:
        # Best-effort scraping: report the problem and return whatever
        # was collected before it occurred.
        print(error)
    return result
if __name__ == "__main__":
    # Fetch a single movie page and print the scraped fields as
    # pretty-printed JSON (non-ASCII characters are kept readable).
    url = 'https://movies.yahoo.com.tw/movieinfo_main.html/id=3'
    # Without a timeout, requests may wait forever on a stalled server.
    response = requests.get(url, timeout=10)
    # Fail loudly on an HTTP error instead of scraping an error page.
    response.raise_for_status()
    result = main(response.content)
    result = json.dumps(result, sort_keys=True, indent=4, ensure_ascii=False)
    print(result)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment