Skip to content

Instantly share code, notes, and snippets.

@zhullyb
Created January 21, 2024 06:14
Show Gist options
  • Save zhullyb/7a4114323f14c9cc4b7fda99e657f632 to your computer and use it in GitHub Desktop.
Save zhullyb/7a4114323f14c9cc4b7fda99e657f632 to your computer and use it in GitHub Desktop.
随手手搓的豆瓣爬虫,不会继续维护
# Value sent as the User-Agent header on page requests (a desktop Firefox string).
ua = 'Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0'
# URL prefix of a Douban movie subject page; the movie id and a trailing '/' are appended.
base_url = 'https://movie.douban.com/subject/'
import requests
from bs4 import BeautifulSoup
import json
def get_html(url, timeout=10):
    """Fetch ``url`` and return the response body as text.

    The request is sent with the module-level ``ua`` string as its User-Agent.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl forever.

    Returns:
        The decoded response body, or ``None`` if the request failed for any
        reason (HTTP error status, connection failure, timeout, ...). The
        error is printed before ``None`` is returned.
    """
    try:
        r = requests.get(url, headers={'User-Agent': ua}, timeout=timeout)
        r.raise_for_status()
    except requests.exceptions.RequestException as e:
        # RequestException is the base class of HTTPError, ConnectionError and
        # Timeout, so every network-level failure maps to the None result.
        print(e)
        return None
    return r.text
def _next_text(label_tag, name):
    """Return the text of the first ``name`` element after ``label_tag``, or None."""
    if label_tag is None:
        return None
    target = label_tag.find_next(name)
    return target.text if target is not None else None


def _attr_names(label_tag):
    """Return the link texts inside the 'attrs' span after ``label_tag``, or None."""
    if label_tag is None:
        return None
    attrs = label_tag.find_next('span', class_='attrs')
    if attrs is None:
        return None
    return [a.text for a in attrs.find_all('a')]


def _trailing_text(label_tag):
    """Return the stripped text node that follows ``label_tag`` as a sibling, or None."""
    if label_tag is None:
        return None
    text = label_tag.find_next_sibling(string=True)
    return text.strip() if text is not None else None


def beautify(id):
    """Scrape one Douban movie page and save its metadata and poster.

    Downloads ``base_url + id + '/'``, extracts the movie fields from the
    page, writes them to ``<id>.json`` and, when the page advertises a poster
    through its ``og:image`` meta tag, downloads the poster to ``<id>.jpg``.
    Both files are written to the current working directory.

    Args:
        id: Douban subject id of the movie (a string or anything that
            formats to the id with ``str()``).

    Returns:
        None. Nothing is written when the page cannot be fetched or does not
        contain the ``div#info`` block.
    """
    url = base_url + f'{id}/'
    try:
        html = get_html(url)
    except requests.exceptions.RequestException as e:
        # get_html may let network errors through; treat them as a failed fetch.
        print(e)
        return None
    if html is None:
        return None
    soup = BeautifulSoup(html, 'html.parser')

    # Movie title
    name_tag = soup.find('span', property='v:itemreviewed')
    movie_name = name_tag.text if name_tag else None

    # Poster URL
    image_tag = soup.find('meta', property="og:image")
    movie_image = image_tag.get("content") if image_tag else None

    # The info block holds the labelled metadata rows. If it is missing the
    # page is not a regular movie page, so skip it instead of crashing.
    info = soup.find('div', id='info')
    if info is None:
        print(f'{id}: no info block found, skipping')
        return None

    # Director (first linked name only)
    director = _next_text(info.find('span', string='导演'), 'a')
    # Screenwriters
    screenwriters = _attr_names(info.find('span', string='编剧'))
    # Main cast
    actors = _attr_names(info.find('span', string='主演'))

    # Genres
    type_tag = info.find('span', string='类型:')
    genres = (
        [g.text for g in type_tag.find_next_siblings('span', property='v:genre')]
        if type_tag else None
    )

    # Official website
    website = _next_text(info.find('span', string='官方网站:'), 'a')
    # Country / region of production
    country = _trailing_text(info.find('span', string='制片国家/地区:'))
    # Language
    language = _trailing_text(info.find('span', string='语言:'))

    # Release dates
    release_tag = info.find('span', string='上映日期:')
    release_date = (
        " / ".join(
            d.text
            for d in release_tag.find_all_next('span', property='v:initialReleaseDate')
        )
        if release_tag else None
    )

    # Running time
    length = _next_text(info.find('span', string='片长:'), 'span')

    # Synopsis
    summary_tag = soup.find('span', property='v:summary')
    description = summary_tag.text.strip() if summary_tag else None

    # Assemble the record. 'image' is a site-local upload path derived from
    # the id rather than the remote poster URL; the f-string also accepts a
    # non-string id, matching the file names used below.
    movie = {
        'mid': id,
        'name': movie_name,
        'image': f"/uploads/{id}.jpg",
        'director': director,
        'screenwriter': " / ".join(screenwriters) if screenwriters else None,
        'mainActor': " / ".join(actors) if actors else None,
        'type': " / ".join(genres) if genres else None,
        'website': website,
        'country': country,
        'language': language,
        'releaseDate': release_date,
        'length': length,
        'description': description
    }

    # Write the JSON file
    with open(f'{id}.json', 'w', encoding='utf-8') as f:
        json.dump(movie, f, ensure_ascii=False, indent=4)

    # Download the poster. Only write the file when the request succeeded, so
    # an error page is never saved under a .jpg name.
    if movie_image:
        try:
            response = requests.get(
                movie_image, headers={'User-Agent': ua}, timeout=10
            )
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(e)
        else:
            with open(f'{id}.jpg', 'wb') as f:
                f.write(response.content)
# Douban subject ids to crawl, processed in the order listed.
lst = [
    '24773958', '25937854', '26636712', '26931786', '26100958',
    '25821634', '3025375', '1866473', '26933210', '6390825',
    '25828589', '30394797', '30304994', '30167997', '1866479',
    '34477861', '7065154', '26213252', '3231742', '10741834',
    '24753477', '25820460', '6560058', '30223888', '6390823',
]

# Print each id as a progress marker, then scrape and save that movie.
for i in lst:
    print(i)
    beautify(i)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment