Created
July 20, 2015 18:19
虾米音乐导出成kgl到网易音乐 含有artist title 和album的信息
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding=utf-8 | |
import requests | |
from pyquery import PyQuery as PQ | |
import urllib2 | |
import sys | |
from bs4 import BeautifulSoup | |
from multiprocessing import Pool | |
from functools import partial | |
import codecs | |
import re | |
# Default HTTP headers; `get` sends these with every request it makes.
HEADERS = {'User-Agent': 'Mozilla/5.0'}
def upload_img42(img):
    """Upload raw image bytes to img42.com and return the hosted image URL.

    Args:
        img: Raw image data, sent as the body of the request.

    Returns:
        A string of the form ``http://img42.com/<id>``, where ``<id>`` is
        the ``id`` field of the JSON document the service replies with.
    """
    # Imported locally: json is not among the module-level imports, so the
    # original json.loads call raised NameError.
    import json

    url = 'http://img42.com'
    req = urllib2.Request(url, data=img)
    resp = urllib2.urlopen(req)
    try:
        msg = resp.read()
    finally:
        # Release the connection even if reading the body fails.
        resp.close()
    return '%s/%s' % (url, json.loads(msg)['id'])
def captcha(jpeg):
    """Show the user where to view a captcha image and return their answer.

    Args:
        jpeg: Raw captcha image data; it is uploaded with ``upload_img42``
            so that a viewable URL can be printed.

    Returns:
        The text the user typed at the ``captcha >`` prompt.
    """
    print('* captcha needed')
    image_url = upload_img42(jpeg)
    print('captcha url: %s' % image_url)
    return raw_input('captcha >')
def captcha_handler(session, html):
    """Work through a captcha page: fetch the image, ask the user, post the code.

    Args:
        session: The requests session used to download the captcha image and
            to submit the answer.
        html: The captcha page, either as markup or as a response object that
            exposes the markup through a ``content`` attribute.

    The hidden ``sessionID``, ``apply`` and ``referer`` form fields found in
    the page are sent back together with the code typed by the user.
    Nothing is returned.
    """
    # Accept a response object as well as plain markup so BeautifulSoup is
    # always given the page text.
    markup = getattr(html, 'content', html)
    bs = BeautifulSoup(markup)

    def hidden_field(name):
        # Value of the <input name=...> element of the captcha form.
        return bs.find('input', {'name': name})['value']

    captcha_url = bs.find('img')['src']
    input_session_id = hidden_field('sessionID')
    input_apply = hidden_field('apply')
    input_referer = hidden_field('referer')

    header = {'Referer': input_referer,
              'User-Agent': 'Mozilla/5.0'}
    captcha_data = session.get(captcha_url, headers=header).content
    code = captcha(captcha_data)

    url = 'http://www.xiami.com/alisec/captcha/tmdgetv3.php'
    data = {'code': code,
            'sessionID': input_session_id,
            'apply': input_apply,
            'referer': input_referer}
    # The reply is not inspected; the caller simply retries its own request.
    session.post(url, data=data, headers=header)
def get(session, *args, **kwargs):
    """GET through ``session``, retrying until a usable response comes back.

    Args:
        session: Object whose ``get`` method performs the request and whose
            ``cookies`` mapping can be updated.
        *args, **kwargs: Forwarded to ``session.get``. Any ``headers`` entry
            is replaced by the module-level ``HEADERS``.

    Returns:
        The first response that is neither a ``sec`` cookie challenge nor a
        captcha page.

    The loop has no retry limit: request errors are printed and retried.
    """
    # Same headers on every attempt, so set them once outside the loop.
    kwargs['headers'] = HEADERS
    while True:
        try:
            data = session.get(*args, **kwargs)
            if data.status_code == 403:
                # A 403 may carry a script that sets a `sec` cookie; copy the
                # value into the session and ask again.
                sec = re.findall('<script>document.cookie="sec=(.*?);', data.content)
                if sec:
                    session.cookies['sec'] = sec[0]
                    continue
        except Exception as e:
            # Retry on failure.
            print('Error %s' % (e,))
            continue
        if 'regcheckcode.taobao.com' in data.content:
            print('Captcha needed.')
            # captcha_handler is a module-level function (there is no `self`
            # here) and parses its second argument as HTML, so hand it the
            # response body. Once it has dealt with the captcha, retry.
            captcha_handler(session, data.content)
            continue
        return data
def get_stared_songs(uid, page):
    """Return the song identifiers found on one page of a user's library.

    Args:
        uid: User id, substituted into the library URL.
        page: Page number, substituted into the library URL.

    Returns:
        A list with one string per ``song_name`` element: the target of its
        link with ``http://www.xiami.com/song/`` removed. ``None`` when the
        page contains no such element.
    """
    page_url = 'http://www.xiami.com/space/lib-song/u/{uid}/page/{page}'.format(
        uid=uid, page=page)
    session = requests.session()
    soup = BeautifulSoup(get(session, page_url).content)

    cells = soup.findAll(attrs={'class': 'song_name'})
    if not cells:
        return None
    return [cell.find('a')['href'].replace('http://www.xiami.com/song/', '')
            for cell in cells]
def get_song_list(uid):
    """Collect the song identifiers from every page of a user's library.

    Args:
        uid: User id, substituted into the library URL.

    Returns:
        A flat list of the values returned by ``get_stared_songs`` for each
        page, in page order. Pages that yielded nothing are skipped.
    """
    session = requests.session()
    first_page = get(session, 'http://www.xiami.com/space/lib-song/u/%s/' % uid,
                     headers=HEADERS).content
    counts_text = BeautifulSoup(first_page).find('span', attrs={'class': 'counts'}).text
    # The last character of the counter is dropped before converting it
    # (presumably a unit suffix - confirm against the page markup).
    song_count = int(counts_text[:-1])

    # Ceiling division by 25 (presumably the page size) without going
    # through floats.
    page_amount = (song_count + 24) // 25
    print('Total pages: %s' % page_amount)

    pool = Pool(20)
    try:
        pages = pool.map(partial(get_stared_songs, uid), range(1, page_amount + 1))
    finally:
        # Always shut the workers down, even when a page fetch raised.
        pool.close()
        pool.join()

    song_list = []
    for page_songs in pages:
        # get_stared_songs returns None for a page without songs.
        if page_songs:
            song_list.extend(page_songs)
    return song_list
def get_song_info_task(song):
    """Fetch a song page and build its ``artist - title - album.mp3`` name.

    Args:
        song: Song identifier, appended to ``http://www.xiami.com/song/``.

    Returns:
        The string ``'<artist> - <title> - <album>.mp3'``, built from the
        page's ``og:music:artist``, ``og:title`` and ``og:music:album``
        properties with the characters ``& > < " '`` removed and the ends
        stripped. A property missing from the page contributes an empty
        string. The name is also printed.

    Raises:
        Exception: The page contains a ``sec`` cookie script whose value
            cannot be extracted.
    """
    suber = re.compile('[&><"\']')
    session = requests.session()
    url = 'http://www.xiami.com/song/%s' % song

    while True:
        page = session.get(url, headers={'User-Agent': 'Mozilla/5.0'}).content
        if 'document.cookie="sec' not in page:
            break
        # The page is a cookie challenge: store the cookie and ask again.
        sec = re.findall('<script>document.cookie="sec=(.*?);', page)
        if not sec:
            raise Exception('could not read the sec cookie for song %s' % song)
        session.cookies['sec'] = sec[0]

    doc = PQ(page)

    def meta(prop):
        # attr() gives None when the tag is absent; treat that as empty so
        # one incomplete page does not abort the whole run.
        value = doc('[property="%s"]' % prop).attr('content')
        return suber.sub('', value or '').strip()

    artist = meta('og:music:artist')
    title = meta('og:title')
    album_name = meta('og:music:album')

    name = '%s - %s - %s.mp3' % (artist, title, album_name)
    print(name)
    return name
def get_song_infos(songs):
    """Run ``get_song_info_task`` over ``songs`` in a pool of 30 processes.

    Args:
        songs: Iterable of song identifiers.

    Returns:
        The list of results, in the same order as ``songs``.
    """
    print('Get song infos')
    pool = Pool(30)
    try:
        return pool.map(get_song_info_task, songs)
    finally:
        # Always shut the workers down, even when a task raised.
        pool.close()
        pool.join()
def save(song_infos):
    """Write the song names to ``xiami.kgl`` in the current directory.

    Args:
        song_infos: Iterable of strings; each is stripped and placed in its
            own ``<File><FileName>...</FileName></File>`` node. The text is
            inserted as-is, without XML escaping.

    The file is UTF-8 encoded and overwritten if it already exists.
    """
    template = u'''<?xml version="1.0" encoding="utf-8"?>
<List ListName="Music list">
{file_nodes}
</List>
'''
    node = u''' <File>
<FileName>%s</FileName>
</File>'''
    nodes = [node % info.strip() for info in song_infos]
    # The with-block closes the file even if the write fails.
    with codecs.open('xiami.kgl', mode='w', encoding='utf-8') as f:
        f.write(template.format(file_nodes=u'\n'.join(nodes)))
def main():
    """Export a user's library: read the user id from argv, fetch, save.

    The first command-line argument must be an integer user id; otherwise a
    usage message is printed and the process exits with status 1.
    """
    try:
        uid = int(sys.argv[1])
    except (IndexError, ValueError):
        # Only a missing or non-numeric argument is a usage error; anything
        # else (e.g. KeyboardInterrupt) is left to propagate.
        print('Arg 1 must be user id')
        sys.exit(1)
    song_list = get_song_list(uid)
    song_infos = get_song_infos(song_list)
    save(song_infos)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment