Skip to content

Instantly share code, notes, and snippets.

@qingfeng
Created August 4, 2008 06:58
Show Gist options
  • Save qingfeng/3867 to your computer and use it in GitHub Desktop.
Save qingfeng/3867 to your computer and use it in GitHub Desktop.
获取贴吧和音乐用户的差异
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import re
import pickle
import urllib2
def getStar(url, regx):
    """Fetch ``url`` and return the star names matched by ``regx``.

    The page is downloaded with ``urllib2``; every match of ``regx`` in the
    raw page is collected, and each name is re-encoded from GBK to UTF-8 and
    lower-cased. Each name is also printed, prefixed with ``sina bar``.

    Args:
        url: Address of the page to download.
        regx: Regular expression with exactly two groups. The second group
            is used as the name; the first group is ignored.

    Returns:
        A list of UTF-8 encoded, lower-cased names in the order they were
        matched (duplicates are kept).

    Raises:
        UnicodeDecodeError: If a matched name is not valid GBK.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Close the connection explicitly instead of leaving it to the
        # garbage collector.
        response.close()

    names = []
    for _title, raw_name in re.findall(regx, html):
        name = raw_name.decode("gbk").encode("utf8").lower()
        # Single-argument print(...) emits the same text as the print
        # statement under Python 2.
        print(" ".join(("sina bar", name)))
        names.append(name)
    return names
def getData(data_file="/tmp/music.out"):
    """Return the Sina Music and Sina Bar name lists, using a pickle cache.

    If ``data_file`` exists, both lists are loaded from it. Otherwise the
    names are scraped with ``getStar`` (the Sina Bar category listing at
    start offsets 0 through 2520 in steps of 120, then the Sina Music singer
    list) and the result is pickled to ``data_file`` for later runs.

    Args:
        data_file: Path of the pickle cache. Defaults to ``/tmp/music.out``.

    Returns:
        A ``(musicstar, barstar)`` tuple of name lists.
    """
    if os.path.isfile(data_file):
        # NOTE(security): unpickling can execute arbitrary code, and the
        # default cache lives in world-writable /tmp. Only use a cache file
        # that this script wrote itself.
        with open(data_file, "rb") as cache:
            data = pickle.load(cache)
        barstar = data["bar"]
        musicstar = data["music"]
        return musicstar, barstar

    # Sina Bar: the listing is paginated through the ``start`` query parameter.
    barurl = "http://bar.sina.com.cn/category.php?cid=3&start=%s"
    barregx = r'''<td><a href="bar.php\?name=\S+" target="_blank" title="(\S+)">(\S+)</a></td>'''
    barstar = []
    for start in range(0, 2520 + 120, 120):
        barstar.extend(getStar(barurl % start, barregx))

    # Sina Music: a single page holds the singer list.
    musicurl = "http://music.sina.com.cn/yueku/singerlist/ar1st3.html"
    musicregx = r'''<td width="20%" height="24" class="p_l11"><a href="/yueku/s/\d+.html" target="_blank" title="(\S+)">(\S+)</a></td>'''
    musicstar = getStar(musicurl, musicregx)

    # Write the cache in binary mode and close it so the data is flushed.
    data = {'bar': barstar, 'music': musicstar}
    with open(data_file, "wb") as cache:
        pickle.dump(data, cache)
    return musicstar, barstar
def main():
    """Print the names present on both sites, then two summary count lines."""
    musicstar, barstar = getData()
    shared = set(musicstar) & set(barstar)
    for name in shared:
        print(name)
    # Joining with spaces reproduces the comma-separated print statement output.
    counts_line = ["len(barstar),len(musicstar)", str(len(barstar)), str(len(musicstar))]
    print(" ".join(counts_line))
    shared_line = ["set(musicstar)&set(barstar)", str(len(shared))]
    print(" ".join(shared_line))
# Run the comparison only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment