Created
January 29, 2020 07:13
-
-
Save iydon/01ed625bbd30b5c34519cc4eb69338e3 to your computer and use it in GitHub Desktop.
Bilibili 搜索
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import collections | |
import json | |
import re | |
import requests | |
import tqdm | |
from bs4 import BeautifulSoup | |
def bilibili_search(keyword, pages=(1,), order='', duration=0, typeid=0, headers=None,
                    *, timeout=10):
    '''Scrape Bilibili's general ("all") search result pages for a keyword.

    This is a generator: one HTTP GET is issued per entry of ``pages``, lazily,
    as the results are consumed.

    :Argument:
        - keyword: search term
        - pages: iterable of page numbers to fetch
        - order: {totalrank: overall ranking, click: most played, pubdate: newest,
          dm: most danmaku (bullet comments), stow: most favorited}
        - duration: {0: any length, 1: under 10 minutes, 2: 10-30 minutes,
          3: 30-60 minutes, 4: over 60 minutes}
        - typeid: {0: all categories, 1: animation, 13: bangumi, 167: Chinese animation,
          3: music, 129: dance, 4: gaming, 36: technology, 188: digital, 160: lifestyle,
          119: kichiku, 155: fashion, 165: advertisement, 5: entertainment,
          181: film and TV, 177: documentary, 23: movie, 11: TV series}
        - headers: optional mapping of HTTP headers forwarded to ``requests.get``
        - timeout: seconds passed to ``requests.get`` (keyword-only). Pass ``None``
          to wait indefinitely, which was the behavior before this parameter existed.

    :Yield:
        ``Info(av, title, duration, playtime, date, author)`` namedtuples of strings,
        one per video card found on each page. ``author`` is the first run of digits
        in the uploader link (presumably the uploader's numeric id).

    :Raise:
        Whatever ``requests.get`` raises (including a timeout error once ``timeout``
        elapses); ``ValueError``/``KeyError``/``IndexError``/``AttributeError`` if the
        page markup no longer matches the hard-coded structure below.
    '''
    url = 'https://search.bilibili.com/all'
    Info = collections.namedtuple('Info',
        ('av', 'title', 'duration', 'playtime', 'date', 'author'))
    params = dict(keyword=keyword, order=order, duration=duration, tids_1=typeid)
    find_av = re.compile(r'(?<=av)\d+')
    find_author = re.compile(r'\d+')
    for page in pages:
        params['page'] = page
        # Without a timeout a stalled connection would block the scrape forever.
        response = requests.get(url, params=params, headers=headers, timeout=timeout)
        soup = BeautifulSoup(response.text, 'lxml')
        # The class names below are hard-coded against the site's markup.
        for card in soup.find_all(class_='video-item matrix'):
            # Each card is expected to have exactly two children: the thumbnail
            # link and the info panel.
            part_1, part_2 = card.children
            av = find_av.findall(part_1.attrs['href'])[0]
            title = part_1.attrs['title']
            # Named ``length`` so it does not shadow the ``duration`` filter argument.
            length = part_1.find(class_='so-imgTag_rb').text
            playtime = part_2.find(class_='so-icon watch-num').text.strip()
            date = part_2.find(class_='so-icon time').text.strip()
            up_name = part_2.find(class_='up-name')
            author = find_author.findall(up_name.attrs['href'])[0]
            yield Info(av, title, length, playtime, date, author)
if __name__ == '__main__':
    # Scrape result pages 1-50 for each keyword and dump every hit into one
    # tab-separated file, prefixed with the keyword that produced it.
    keywords = '武汉', '新型冠状病毒', '医护人员', '新型肺炎'
    pages = range(1, 50+1)
    filename = 'bilibili.tsv'
    columns = 'keyword', 'av', 'title', 'duration', 'playtime', 'date', 'author'
    # Explicit UTF-8: the keywords written below are Chinese, and the platform's
    # default encoding is not guaranteed to be able to represent them.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write('\t'.join(columns) + '\n')
        for keyword in tqdm.tqdm(keywords):
            for info in bilibili_search(keyword, pages):
                f.write('\t'.join((keyword, *info)) + '\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment