Skip to content

Instantly share code, notes, and snippets.

@ly0
Created July 20, 2015 18:19
虾米音乐导出成kgl到网易音乐 含有artist title 和album的信息
#coding=utf-8
import requests
from pyquery import PyQuery as PQ
import urllib2
import sys
from bs4 import BeautifulSoup
from multiprocessing import Pool
from functools import partial
import codecs
import re
# Browser-like User-Agent header sent with the HTTP requests to xiami.com.
HEADERS = {'User-Agent': 'Mozilla/5.0'}
def upload_img42(img):
    """Upload raw image data to img42.com and return the resulting URL.

    The image is sent as the body of a POST request. The reply is parsed as
    JSON and its ``id`` field is appended to the base URL.

    :param img: raw image data to upload.
    :return: string of the form ``http://img42.com/<id>``.
    """
    # Imported locally: the module never imports json at the top of the file.
    import json

    url = 'http://img42.com'
    req = urllib2.Request(url, data=img)
    response = urllib2.urlopen(req)
    try:
        msg = response.read()
    finally:
        # Release the connection even if reading the reply fails.
        response.close()
    return '%s/%s' % (url, json.loads(msg)['id'])
def captcha(jpeg):
    """Ask the operator to solve a captcha and return what they typed.

    The image is uploaded via upload_img42() so it can be viewed in a
    browser; its URL is printed and the answer is read from standard input.

    :param jpeg: raw captcha image data.
    :return: the text entered at the ``captcha >`` prompt.
    """
    print('* captcha needed')
    image_url = upload_img42(jpeg)
    print('%s %s' % ('captcha url:', image_url))
    return raw_input('captcha >')
def captcha_handler(session, html):
    """Solve the captcha challenge page and submit the answer.

    Reads the captcha image URL and the hidden ``sessionID``, ``apply`` and
    ``referer`` form fields from the challenge page, downloads the image
    through *session*, asks the operator for the code via captcha(), and
    posts the answer back to xiami's captcha endpoint.

    :param session: requests session used for both the image download and
        the answer submission.
    :param html: the challenge page, either as markup or as a response
        object exposing the markup through a ``content`` attribute.
    :return: None.
    """
    # The caller may hand over the whole response object rather than markup.
    markup = getattr(html, 'content', html)
    bs = BeautifulSoup(markup)
    captcha_url = bs.find('img')['src']
    # Hidden form fields that must be echoed back with the answer.
    fields = dict(
        (name, bs.find('input', {'name': name})['value'])
        for name in ('sessionID', 'apply', 'referer')
    )
    header = {'Referer': fields['referer'],
              'User-Agent': 'Mozilla/5.0'}
    captcha_data = session.get(captcha_url, headers=header).content
    fields['code'] = captcha(captcha_data)
    session.post('http://www.xiami.com/alisec/captcha/tmdgetv3.php',
                 data=fields, headers=header)
def get(session, *args, **kwargs):
    """GET through *session*, retrying until a usable response arrives.

    The request is repeated when:
    * ``session.get`` raises (the error is printed first);
    * the reply is a 403 whose body sets a ``sec`` cookie via inline script
      (the cookie is copied into the session first);
    * the body references ``regcheckcode.taobao.com`` (the captcha handler
      is run first).

    NOTE: there is no retry limit, so a permanently failing request loops
    forever.

    :param session: requests session to issue the request with.
    :param args: positional arguments forwarded to ``session.get``.
    :param kwargs: keyword arguments forwarded to ``session.get``; HEADERS
        is used for ``headers`` unless the caller supplies its own.
    :return: the response object of the first accepted reply.
    """
    # Keep caller-supplied headers instead of silently overwriting them.
    kwargs.setdefault('headers', HEADERS)
    while True:
        try:
            data = session.get(*args, **kwargs)
        except Exception as e:
            # Retry on failure.
            print('Error %s' % e)
            continue
        if data.status_code == 403:
            sec = re.findall('<script>document.cookie="sec=(.*?);', data.content)
            if sec:
                session.cookies['sec'] = sec[0]
                continue
        if 'regcheckcode.taobao.com' in data.content:
            print('Captcha needed.')
            # Hand the challenge page markup and the session to the captcha
            # handler, then retry the request once it has been dealt with.
            captcha_handler(session, data.content)
            continue
        return data
def get_stared_songs(uid, page):
    """Return the song ids listed on one page of a user's song library.

    :param uid: xiami user id.
    :param page: page number of the library listing.
    :return: list of hrefs with the ``http://www.xiami.com/song/`` prefix
        removed, or None when the page has no ``song_name`` elements.
    """
    session = requests.session()
    page_url = 'http://www.xiami.com/space/lib-song/u/{uid}/page/{page}'.format(
        uid=uid, page=page)
    soup = BeautifulSoup(get(session, page_url).content)

    name_nodes = soup.findAll(attrs={'class': 'song_name'})
    if not name_nodes:
        return None
    return [
        node.find('a')['href'].replace('http://www.xiami.com/song/', '')
        for node in name_nodes
    ]
def get_song_list(uid):
    """Collect the ids of every song in a user's library.

    Reads the total song count from the library front page, derives the
    number of listing pages (25 songs per page), and fetches the pages in
    parallel with a pool of 20 worker processes.

    :param uid: xiami user id.
    :return: flat list of song ids gathered from all pages.
    """
    session = requests.session()
    foo = get(session, 'http://www.xiami.com/space/lib-song/u/%s/' % uid, headers=HEADERS).content
    counts_text = BeautifulSoup(foo).find('span', attrs={'class': 'counts'}).text
    # The last character is not part of the number (presumably a unit suffix).
    song_count = int(counts_text[:-1])
    # Ceiling division without going through floats.
    page_amount = (song_count + 24) // 25
    print('Total pages: %s' % page_amount)

    pool = Pool(20)
    try:
        pages = pool.map(partial(get_stared_songs, uid), range(1, page_amount + 1))
    finally:
        # Always release the worker processes.
        pool.terminate()
        pool.join()

    song_list = []
    for page_songs in pages:
        # get_stared_songs returns None for a page without songs.
        if page_songs:
            song_list.extend(page_songs)
    return song_list
def get_song_info_task(song):
    """Fetch a song page and build its ``artist - title - album.mp3`` name.

    The page is requested until it no longer contains the inline script that
    sets a ``sec`` cookie; each time that script is seen, the cookie value is
    copied into the session before retrying. Artist, title and album are read
    from the page's ``og:`` meta tags, with the characters ``& > < " '``
    removed and surrounding whitespace stripped.

    :param song: song id appended to ``http://www.xiami.com/song/``.
    :return: the formatted file name, which is also printed.
    :raises Exception: if the cookie script is present but no ``sec`` value
        can be extracted from it.
    """
    suber = re.compile('[&><"\']')
    session = requests.session()

    def _clean(value):
        # A missing meta tag yields None; treat it as an empty field rather
        # than letting the substitution raise TypeError.
        return suber.sub('', value or '').strip()

    while True:
        foo = session.get('http://www.xiami.com/song/%s' % song,
                          headers={'User-Agent': 'Mozilla/5.0'}).content
        if 'document.cookie="sec' in foo:
            sec = re.findall('<script>document.cookie="sec=(.*?);', foo)
            if not sec:
                raise Exception(
                    'song %s: cookie script found but no "sec" value in it' % song)
            session.cookies['sec'] = sec[0]
            continue
        bar = PQ(foo)
        artist = _clean(bar('[property="og:music:artist"]').attr('content'))
        title = _clean(bar('[property="og:title"]').attr('content'))
        album_name = _clean(bar('[property="og:music:album"]').attr('content'))
        file_name = '%s - %s - %s.mp3' % (artist, title, album_name)
        print(file_name)
        return file_name
def get_song_infos(songs):
    """Resolve every song id to its formatted file name in parallel.

    Runs get_song_info_task over *songs* with a pool of 30 worker processes.

    :param songs: iterable of song ids.
    :return: list of formatted names, in the same order as *songs*.
    """
    print('Get song infos')
    pool = Pool(30)
    try:
        return pool.map(get_song_info_task, songs)
    finally:
        # Always release the worker processes.
        pool.terminate()
        pool.join()
def save(song_infos):
    """Write the song names to ``xiami.kgl`` in the current directory.

    Each name is stripped of surrounding whitespace and wrapped in a
    ``<File><FileName>...</FileName></File>`` node inside a single
    ``<List ListName="Music list">`` element. The file is UTF-8 encoded and
    any existing file of that name is overwritten.

    :param song_infos: iterable of song name strings.
    :return: None.
    """
    templates = (u'<?xml version="1.0" encoding="utf-8"?>\n'
                 u'<List ListName="Music list">\n'
                 u'{file_nodes}\n'
                 u'</List>\n')
    node = u' <File>\n<FileName>%s</FileName>\n</File>'
    file_nodes = u'\n'.join(node % i.strip() for i in song_infos)
    # The with-block guarantees the file is flushed and closed.
    with codecs.open('xiami.kgl', mode='w', encoding='utf-8') as f:
        f.write(templates.format(file_nodes=file_nodes))
def main():
    """Export a xiami user's song library to ``xiami.kgl``.

    Expects the user id as the first command-line argument. Prints a usage
    hint and exits with status 1 when it is missing or not an integer.
    """
    try:
        uid = int(sys.argv[1])
    except (IndexError, ValueError):
        # Only a missing or non-numeric argument is a usage error; anything
        # else (e.g. KeyboardInterrupt) must propagate.
        print('Arg 1 must be user id')
        sys.exit(1)
    song_list = get_song_list(uid)
    song_infos = get_song_infos(song_list)
    save(song_infos)


# Run the export only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment