Skip to content

Instantly share code, notes, and snippets.

@iydon
Created January 29, 2020 07:13
Show Gist options
  • Save iydon/01ed625bbd30b5c34519cc4eb69338e3 to your computer and use it in GitHub Desktop.
Bilibili 搜索
import collections
import json
import re
import requests
import tqdm
from bs4 import BeautifulSoup
def bilibili_search(keyword, pages=None, order='', duration=0, typeid=0, headers=None):
    '''Search Bilibili (综合搜索 / combined search) and yield one record per video result.

    :Arguments:
        - keyword: search term.
        - pages: iterable of 1-based result-page numbers (default: page 1 only).
        - order: sort order — {totalrank: overall, click: most clicks,
          pubdate: newest, dm: most danmaku, stow: most favorites}.
        - duration: {0: any length, 1: <10 min, 2: 10-30 min, 3: 30-60 min, 4: >60 min}.
        - typeid: Bilibili section id, 0 for all sections (1: 动画, 13: 番剧, 167: 国创,
          3: 音乐, 129: 舞蹈, 4: 游戏, 36: 科技, 188: 数码, 160: 生活, 119: 鬼畜,
          155: 时尚, 165: 广告, 5: 娱乐, 181: 影视, 177: 纪录片, 23: 电影, 11: 电视剧).
        - headers: extra HTTP headers passed to requests.get (default: none).

    :Yields:
        Info(av, title, duration, playtime, date, author) — all fields are str.

    :Raises:
        requests.HTTPError: when a results page returns a non-2xx status.
    '''
    # Avoid mutable default arguments ([] / dict()) — they are shared across calls.
    if pages is None:
        pages = (1,)
    if headers is None:
        headers = {}
    url = 'https://search.bilibili.com/all'
    Info = collections.namedtuple('Info',
        ('av', 'title', 'duration', 'playtime', 'date', 'author'))
    params = dict(keyword=keyword, order=order, duration=duration, tids_1=typeid)
    find_av = re.compile(r'(?<=av)\d+')  # av-id embedded in the video link href
    find_author = re.compile(r'\d+')     # uploader uid embedded in the up-name href
    for page in pages:
        params['page'] = page
        response = requests.get(url, params=params, headers=headers)
        # Fail loudly on HTTP errors instead of silently parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')
        # NOTE(review): the CSS class names below are scraped from Bilibili's
        # markup (original marks them "hard-code") and may break if the site changes.
        for card in soup.find_all(class_='video-item matrix'):
            part_1, part_2 = card.children
            av = find_av.findall(part_1.attrs['href'])[0]
            title = part_1.attrs['title']
            duration = part_1.find(class_='so-imgTag_rb').text
            playtime = part_2.find(class_='so-icon watch-num').text.strip()
            date = part_2.find(class_='so-icon time').text.strip()
            up_name = part_2.find(class_='up-name')
            author = find_author.findall(up_name.attrs['href'])[0]
            yield Info(av, title, duration, playtime, date, author)
if __name__ == '__main__':
    # Crawl Bilibili search results for several epidemic-related keywords
    # and dump them as a tab-separated file, one row per video.
    keywords = '武汉', '新型冠状病毒', '医护人员', '新型肺炎'
    pages = range(1, 50+1)
    filename = 'bilibili.tsv'
    # Explicit UTF-8: keywords and video titles are Chinese, and the platform
    # default encoding (e.g. GBK on Chinese Windows) would raise UnicodeEncodeError.
    with open(filename, 'w', encoding='utf-8') as f:
        columns = 'keyword', 'av', 'title', 'duration', 'playtime', 'date', 'author'
        f.write('\t'.join(columns) + '\n')
        for keyword in tqdm.tqdm(keywords):
            for info in bilibili_search(keyword, pages):
                f.write('\t'.join((keyword,) + info) + '\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment