Created
June 1, 2015 08:18
-
-
Save ihciah/5dcfae3e34cf9b295f5b to your computer and use it in GitHub Desktop.
两个无聊的小脚本
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
# Filter users who posted within the last month and have reached level 12.
import re
import sys
import urllib
import urllib2
from contextlib import closing
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool
# Listing URL of the forum selected by the percent-encoded ``kw`` parameter;
# the numeric ``pn`` offset is appended by the crawl loop.
URLBASE = 'http://tieba.baidu.com/f?kw=%E5%8D%8E%E4%B8%9C%E7%90%86%E5%B7%A5%E5%A4%A7%E5%AD%A6&ie=utf-8&pn='
# Distinct names captured from the listing pages.
result = set()
# Names whose reported level is 'lv12'; appended to by the worker threads.
realres = []
# Thread-backed pool (multiprocessing.dummy) with 10 workers.
pool = ThreadPool(10)
# Captures the text of every anchor whose href ends in "&ie=utf-8&fr=frs" and
# that opens in a new tab. Compiled once because conp runs for every page.
_NAME_LINK_RE = re.compile(r'&ie=utf-8&fr=frs" target="_blank">(.+?)</a>')


def conp(s):
    """Return the link texts captured from one listing page.

    Args:
        s: Raw HTML of a listing page.

    Returns:
        A list with one captured anchor text per matching link, in document
        order and possibly with duplicates. The caller uses these values as
        user names. Empty when nothing matches.
    """
    return _NAME_LINK_RE.findall(s)
# Byte pattern capturing the CSS class that follows "forum_level " for the
# forum named in the pattern. It is encoded to GBK once here rather than on
# every call (presumably the profile page is served as GBK - confirm).
_LEVEL_RE = re.compile(
    u'<span>华东理工大学</span><span class="forum_level (.+?)">'.encode('GBK'))


def getlevel(s):
    """Fetch a user's profile page and return the forum-level classes on it.

    Args:
        s: User name, sent as the ``un`` query parameter.

    Returns:
        A list of the captured level classes (the caller compares entries
        against 'lv12'). Empty when the page has no matching badge.

    Raises:
        urllib2.URLError: If the profile page cannot be fetched.
    """
    pram = urllib.urlencode({'un': s, 'ie': 'utf-8', 'fr': 'frs'})
    url = 'http://tieba.baidu.com/home/main/?' + pram
    # Python 2 response objects are not context managers; closing() releases
    # the connection even when read() raises.
    with closing(urllib2.urlopen(url)) as response:
        pres = response.read()
    return _LEVEL_RE.findall(pres)
def doit(i):
    """Look up one user's level and record the user if it is 'lv12'.

    Runs on the worker threads of the pool. A failed profile fetch is
    reported on stderr and the user is skipped, so that one bad request does
    not abort the whole ``pool.map`` run.

    Args:
        i: User name to check.
    """
    try:
        mk = getlevel(i)
    except IOError as err:
        # urllib2.URLError / HTTPError and socket errors are IOError
        # subclasses in Python 2.
        sys.stderr.write('skip %s: %s\n' % (i, err))
        return
    if not mk:
        # No level badge found on the profile page.
        return
    # Emit the line with a single write so output from concurrent workers
    # is not interleaved mid-line.
    sys.stdout.write('%s %s\n' % (mk[0], i))
    if mk[0] == 'lv12':
        realres.append(i)
# Walk the listing pages at offsets 0, 50, ..., 2000 and collect the distinct
# names that appear on them.
for i in range(0, 2050, 50):
    # closing() releases each HTTP response; Python 2 responses are not
    # context managers on their own.
    with closing(urllib2.urlopen(URLBASE + str(i))) as page:
        res = page.read()
    result.update(conp(res))
    # Progress: current offset and number of distinct names so far.
    print('%d %d' % (i, len(result)))

# Check every collected name on the worker threads.
l = list(result)
try:
    pool.map(doit, l)
finally:
    # Shut the pool down even if a worker raised.
    pool.close()
    pool.join()
print(realres)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
# Print the follow list of every user named in the input file.
import re
import urllib
import urllib2
from contextlib import closing

# Read all user names up front; the with-block closes the file handle.
with open('D:/in.txt', 'r') as f:
    l = f.readlines()

for i in l:
    # Normalise the line into a bare user name (drop newlines and spaces).
    m = i.replace('\n', '').replace('\r', '').replace(' ', '')
    if not m:
        # Blank line: there is no user to look up, so skip the request.
        continue
    print(i)
    pram = urllib.urlencode({'un': m, 'ie': 'utf-8', 'fr': 'frs'})
    url = 'http://tieba.baidu.com/home/main/?' + pram
    # closing() releases the HTTP response; Python 2 responses are not
    # context managers on their own.
    with closing(urllib2.urlopen(url)) as response:
        pres = response.read()
    # The raw line is printed a second time as a heading for the entries.
    print(i)
    for t in re.findall(r'class="u-f-item unsign"><span>(.+?)</span>', pres):
        # Entries are decoded from GBK before printing.
        print(t.decode('GBK'))
    print('------------')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment