Skip to content

Instantly share code, notes, and snippets.

@iamued
Last active June 6, 2016 16:24
Show Gist options
  • Save iamued/8429889 to your computer and use it in GitHub Desktop.
Save iamued/8429889 to your computer and use it in GitHub Desktop.
CNZZ API client written in Python
__author__ = 'richie'
# -*- coding: utf-8 -*-
import urllib, urllib2, cookielib, re, os, codecs, time, subprocess,sys,json
#import simplejson as json
import dateutil
class cnzz(object):
def __init__(self, username = '', password = '',othername=''):
self.__username = username
self.__othername = othername
self.__password = password
self.__opener = ''
self.__sitelist = []
def login(self):
myCookie = urllib2.HTTPCookieProcessor(cookielib.CookieJar());
self.__opener = urllib2.build_opener(myCookie)
post_data = {
'username': self.__username,
'password': self.__password
}
req = urllib2.Request('http://new.cnzz.com/user/login.php', urllib.urlencode(post_data))
#print 'var _username = "'+self.__username+'";'
loginhtml= self.__opener.open(req).read()
#print loginhtml
if(loginhtml.find('_username') > 1):
#self.__opener=opener
return True
else:
return False
def getSiteListPageCount(self):
url="http://new.cnzz.com/v1/main.php?s=site_list&sort=0&everypage=30&setpage"
req=urllib2.Request(url)
html= self.__opener.open(req).read().decode('gbk').encode('utf-8')
#print html
match=re.compile(r'第1/(?P<pagecount>\d+?)页 ').search(html)
if match:
#print match.group('pagecount')
return int(match.group('pagecount'))
#html.find('第1/3页')
#html=self.__opener(urllib2.Request(url)).read()
#print html
def getSiteList(self):
pagecount=self.getSiteListPageCount()
print "count page :"+str(pagecount)
for i in range(1,pagecount+1):
url="http://new.cnzz.com/v1/main.php?s=site_list&sort=0&page="+str(i)+"&everypage=30"
#print url
req=urllib2.Request(url)
html= self.__opener.open(req).read().decode('gbk').encode('utf-8')
#print html
match2=re.compile(r'<script>site_data\(\'(.+)\'\);</script>').findall(html)
match3=re.compile(r'<div class="col-1">(?P<sitename>.+)').findall(html)
if(match2 and match3):
#print match2
#print len(match2)
#print len(match3)
#print match3
for i in range(0,len(match3)):
print match3[i][0:-1]+"@@@"+match2[i]
self.__sitelist.append(match3[i][0:-1]+"@@@"+match2[i])
else:
print 'getSiteList Error'
sys.exit()
print 'getSiteList OK'
return self.__sitelist
def yesterdayinfo(self,siteid=''):
if(siteid != ''):
url ="http://new.cnzz.com/v1/data/site_list_data.php?siteid="+siteid
req=urllib2.Request(url)
try:
html= self.__opener.open(req).read().decode('gbk').encode('utf-8')
except :
return self.yesterdayinfo(siteid)
#print siteid+'ok'
#data=eval(html)[1]
data=json.loads(html)
#print data
return data
#data=json.loads("{"+html+"}")
#print data
else:
print 'no siteid'
def getSiteInfoByDate(self,siteid='',startdate='',enddate=''):
#url = "http://new.cnzz.com/v1/main.php?siteid=" + siteid + "&s=timeflux&st=" + startdate + "&et=" + enddate
url = "http://new.cnzz.com/v1/main.php?siteid=" + siteid + "&s=timeflux&st=" + startdate + "&et=" + enddate
req=urllib2.Request(url)
html= self.__opener.open(req).read().decode('gbk').encode('utf-8')
siteinfo={}
for i in reversed(dateutil.getDays(startdate, enddate)):
pn = re.compile(
r'<td>' + i + '</td>\s+<td class="num1">(?P<pv>.+)</td>\s+<td class="num1">(?P<uv>.+)</td>\s+<td class="num1">(?P<ip>.+)</td>'
, re.I)
mn = pn.search(html)
#print html_src2.decode('gbk').encode('utf-8')
#sitesinfo[key][i] = mn.group('uv')
#print i+"uv:"+mn.group('uv')
if(mn):
siteinfo[i.replace('星期六','').replace('星期天','')]=[mn.group('pv'),mn.group('uv'),mn.group('ip')]
#print siteinfo
return siteinfo
def getHistoryKeyByDate(self,siteid='',startdate='',enddate='',page=1):
url = "http://new.cnzz.com/v1/main.php?siteid=" + siteid + "&s=key&st=" + startdate + "&et=" + enddate
print url
req=urllib2.Request(url)
html= self.__opener.open(req).read().decode('gbk').encode('utf-8')
match=re.compile(r'第1/(?P<pagecount>\d+?)页 ').search(html)
#pagecount=1
if match:
#print match.group('pagecount')
pagecount= int(match.group('pagecount'))
# match2=re.compile(r'<td title=\'(.+)\'>').findall(html)
#for i in range(0,len(match2)):
# print match2[i]
pagecount=3
keyinfos=[]
for i in range(1,pagecount+1):
print '正在抓取关键词列表第'+str(i)+'页'
url = "http://new.cnzz.com/v1/main.php?siteid=" + siteid + "&s=key&st=" + startdate + "&et=" + enddate +"&page="+str(i)
req=urllib2.Request(url)
html= self.__opener.open(req).read().decode('gbk','ignore').encode('utf-8')
mt=re.compile(r'<td title=\'(?P<key>.+) \'>(?P<key1>.+)</td>\s+<td class=\'all_right\'>(?P<snum>.+)</td>\s+<td class=\'all_right\'>(?P<uv>.+)</td>')
mn=mt.findall(html)
for i in range(0,len(mn)):
#print mn[i][0],'\t',mn[i][2],'\t',mn[i][3]
keyinfos.append([mn[i][0],mn[i][2],mn[i][3]])
'''
<td title='www.jxeea.cn '>www.jxeea.cn</td>
<td class='all_right'>38024</td>
<td class='all_right'>29859</td>
<td class='all_right'>28771</td>
<td class='all_right'>24385</td>
'''
print '共抓取关键词'+str(len(keyinfos))+'个'
return keyinfos
#print html
def getKeyHistory(self,siteid='',startdate='',enddate='',key=''):
#通过key查询时间段内最高,和最低的搜索量
#http://new.cnzz.com/v1/main.php?siteid=2918556&s=history_key&st=2013-04-08&et=2013-05-07&t=30&url=%C9%EE%DB%DA%BB%E1%BC%C6%B4%D3%D2%B5%D7%CA%B8%F1%BF%BC%CA%D4%B1%A8%C3%FB%CF%B5%CD%B3
url = "http://new.cnzz.com/v1/main.php?siteid=" + siteid + "&s=history_key&st=" + startdate + "&et=" + enddate+"&url="+urllib2.quote(key.decode('utf-8').encode('gbk'))
#print url
req=urllib2.Request(url)
html= self.__opener.open(req).read().decode('gbk').encode('utf-8')
#print html
# match2=re.compile(r'<td title=\'(.+)\'>').findall(html)
#for i in range(0,len(match2)):
# print match2[i]
#mt=re.compile(r'<td title=\'(?P<key>.+) \'>(?P<key1>.+)</td>\s+<td class=\'all_right\'>(?P<snum>.+)</td>\s+<td class=\'all_right\'>(?P<uv>.+)</td>')
#mn=mt.findall(html)
datelist=reversed(dateutil.getDays2(startdate, enddate))
uvlist={}
for i in datelist:
#print i
pn = re.compile(
r'<td> '+i+' </td>\s+<td class="num1">(?P<snum>.+)</td>\s+<td class="num1">(?P<uv>.+)</td>'
#\s+<td class="num1">(?P<uv>.+)</td>\s+<td class="num1">(?P<ip>.+)</td>\s+<td class="num2">(?P<newuv>.+)</td>
, re.I)
mn = pn.search(html)
#print html_src2.decode('gbk').encode('utf-8')
#sitesinfo[key][i] = mn.group('uv')
#print i+"uv:"+mn.group('uv')
#print mn
if(mn):
#siteinfo[i.replace('星期六','').replace('星期天','')]=
#print i,'\t',mn.group('snum'),'uv:',mn.group('uv')
uvlist[i.replace('星期六','').replace('星期天','')]=int(mn.group('uv'))
#print uvlist
#print 'max:',self.sort_by_value(uvlist)[-1],'min:',self.sort_by_value(uvlist)[0]
return [self.sort_by_value(uvlist)[-1],self.sort_by_value(uvlist)[0]]
def sort_by_value(self,d):
return sorted(d.items(), key=lambda d:d[1])
# Commented-out usage example, disabled by wrapping it in a string literal.
'''
if __name__ == '__main__':
CnzzTool=cnzz('username','password','')
if(CnzzTool.login()):
print "LoginOk"
else:
print "LoginError"
'''
__author__ = 'richie'
# -*- coding: utf-8 -*-
import urllib, urllib2, cookielib, re, os, codecs, time, subprocess,sys,json,math
#import simplejson as json
import dateutil
class cnzz(object):
def __init__(self, username = '', password = '',othername=''):
self.__username = username
self.__othername = othername
self.__password = password
self.__opener = ''
self.__sitelist = []
def login(self):
myCookie = urllib2.HTTPCookieProcessor(cookielib.CookieJar());
self.__opener = urllib2.build_opener(myCookie)
post_data = {
'username': self.__username,
'password': self.__password
}
req = urllib2.Request('http://new.cnzz.com/user/login.php', urllib.urlencode(post_data))
#print 'var _username = "'+self.__username+'";'
loginhtml= self.__opener.open(req).read()
#print loginhtml#.decode('gbk').encode('utf-8')
if(loginhtml.find('_username') > 1 or loginhtml.find('登陆进入旧版站长')>1):
#self.__opener=opener
return True
else:
return False
def getSiteListPageCount(self):
url="http://tongji.cnzz.com/main.php?c=site&a=show&ajax=module=gettotallist|module=list|module=isOpenTongji&sort=1&search=&currentPage=1&pageType=30&_="+str(int(time.time()))
req=urllib2.Request(url)
html= self.__opener.open(req).read().decode('gbk').encode('utf-8')
totalsite= int(json.loads(html)['data']['gettotallist']['totalsite'])
#print (155/90.0)
return math.ceil(totalsite/90.0)
#print html
#match=re.compile(r'第1/(?P<pagecount>\d+?)页 ').search(html)
#if match:
#print match.group('pagecount')
# return int(match.group('pagecount'))
#html.find('第1/3页')
#html=self.__opener(urllib2.Request(url)).read()
#print html
def getuserdetail(self,url):
req=urllib2.Request(url)
html= self.__opener.open(req).read().decode('gbk').encode('utf-8')
sitelist=json.loads(html)
print html
return sitelist
def getSiteList(self):
pagecount=int(self.getSiteListPageCount())
print "count page :"+str(pagecount)
for i in range(1,pagecount+1):
#url="http://new.cnzz.com/v1/main.php?s=site_list&sort=0&page="+str(i)+"&everypage=30"
url="http://tongji.cnzz.com/main.php?c=site&a=show&ajax=module=gettotallist|module=list|module=isOpenTongji&sort=1&search=&currentPage="+str(i)+"&pageType=90&_=1385011097947"
#print url
req=urllib2.Request(url)
html= self.__opener.open(req).read().decode('gbk').encode('utf-8')
sitelist=json.loads(html)
sitelist=sitelist['data']['list']['items']
for x in range(len(sitelist)):
#print sitelist[x]['name']+"@@@"+sitelist[x]['siteid']
#print x
self.__sitelist.append(sitelist[x]['name']+"@@@"+sitelist[x]['siteid'])
#print html
#match2=re.compile(r'<script>site_data\(\'(.+)\'\);</script>').findall(html)
#match3=re.compile(r'<div class="col-1">(?P<sitename>.+)').findall(html)
#if(match2 and match3):
# #print match2
# #print len(match2)
# #print len(match3)
# #print match3
# for i in range(0,len(match3)):
# print match3[i][0:-1]+"@@@"+match2[i]
# self.__sitelist.append(match3[i][0:-1]+"@@@"+match2[i])
#else:
# print 'getSiteList Error'
# sys.exit()
print 'getSiteList OK'
return self.__sitelist
def yesterdayinfo(self,siteid=''):
if(siteid != ''):
#url ="http://new.cnzz.com/v1/data/site_list_data.php?siteid="+siteid
url="http://tongji.cnzz.com/main.php?c=site&a=show&ajax=module=gettotallist|module=list|module=isOpenTongji&sort=1&search="+siteid+"&currentPage=1&pageType=90&_=1385012521584"
req=urllib2.Request(url)
try:
html= self.__opener.open(req).read().decode('gbk').encode('utf-8')
except :
return self.yesterdayinfo(siteid)
siteinfo= json.loads(html)['data']['list']['items']
siteyinfo=[(),()]
if(len(siteinfo)==1):
#print siteinfo[0]['y_uv']
siteyinfo[1]=[siteinfo[0]['y_pv'],siteinfo[0]['y_uv'],siteinfo[0]['y_ip']]
#print siteyinfo
#print siteid+'ok'
#data=eval(html)[1]
#data=json.dumps(html)
#print data
#print data
return list(siteyinfo)
#data=json.loads("{"+html+"}")
#print data
else:
print 'no siteid'
def getSiteInfoByDate(self,siteid='',startdate='',enddate=''):
#url = "http://new.cnzz.com/v1/main.php?siteid=" + siteid + "&s=timeflux&st=" + startdate + "&et=" + enddate
#url = "http://new.cnzz.com/v1/main.php?siteid=" + siteid + "&s=timeflux&st=" + startdate + "&et=" + enddate
url ="http://tongji.cnzz.com/main.php?c=flow&a=trend&ajax=module%3Dsummary%7Cmodule%3DfluxList_currentPage%3D1_pageType%3D90&siteid="+siteid+"&st="+ startdate +"&et="+ enddate+"&_=1385013202955"
req=urllib2.Request(url)
html= self.__opener.open(req).read().decode('gbk').encode('utf-8')
siteinfoitems= json.loads(html)['data']['fluxList']['items']
siteinfo={}
for x in range(len(siteinfoitems)):
print siteinfoitems[x]['key']
siteinfo[siteinfoitems[x]['key']]=[siteinfoitems[x]['pv'],siteinfoitems[x]['uv'],siteinfoitems[x]['ip']]
#print siteinfo
#exit()
return siteinfo
def getHistoryKeyByDate(self,siteid='',startdate='',enddate='',page=1):
url = "http://new.cnzz.com/v1/main.php?siteid=" + siteid + "&s=key&st=" + startdate + "&et=" + enddate
print url
req=urllib2.Request(url)
html= self.__opener.open(req).read().decode('gbk').encode('utf-8')
match=re.compile(r'第1/(?P<pagecount>\d+?)页 ').search(html)
#pagecount=1
if match:
#print match.group('pagecount')
pagecount= int(match.group('pagecount'))
# match2=re.compile(r'<td title=\'(.+)\'>').findall(html)
#for i in range(0,len(match2)):
# print match2[i]
pagecount=3
keyinfos=[]
for i in range(1,pagecount+1):
print '正在抓取关键词列表第'+str(i)+'页'
url = "http://new.cnzz.com/v1/main.php?siteid=" + siteid + "&s=key&st=" + startdate + "&et=" + enddate +"&page="+str(i)
req=urllib2.Request(url)
html= self.__opener.open(req).read().decode('gbk','ignore').encode('utf-8')
mt=re.compile(r'<td title=\'(?P<key>.+) \'>(?P<key1>.+)</td>\s+<td class=\'all_right\'>(?P<snum>.+)</td>\s+<td class=\'all_right\'>(?P<uv>.+)</td>')
mn=mt.findall(html)
for i in range(0,len(mn)):
#print mn[i][0],'\t',mn[i][2],'\t',mn[i][3]
keyinfos.append([mn[i][0],mn[i][2],mn[i][3]])
'''
<td title='www.jxeea.cn '>www.jxeea.cn</td>
<td class='all_right'>38024</td>
<td class='all_right'>29859</td>
<td class='all_right'>28771</td>
<td class='all_right'>24385</td>
'''
print '共抓取关键词'+str(len(keyinfos))+'个'
return keyinfos
#print html
def getKeyHistory(self,siteid='',startdate='',enddate='',key=''):
#通过key查询时间段内最高,和最低的搜索量
#http://new.cnzz.com/v1/main.php?siteid=2918556&s=history_key&st=2013-04-08&et=2013-05-07&t=30&url=%C9%EE%DB%DA%BB%E1%BC%C6%B4%D3%D2%B5%D7%CA%B8%F1%BF%BC%CA%D4%B1%A8%C3%FB%CF%B5%CD%B3
url = "http://new.cnzz.com/v1/main.php?siteid=" + siteid + "&s=history_key&st=" + startdate + "&et=" + enddate+"&url="+urllib2.quote(key.decode('utf-8').encode('gbk'))
#print url
req=urllib2.Request(url)
html= self.__opener.open(req).read().decode('gbk').encode('utf-8')
#print html
# match2=re.compile(r'<td title=\'(.+)\'>').findall(html)
#for i in range(0,len(match2)):
# print match2[i]
#mt=re.compile(r'<td title=\'(?P<key>.+) \'>(?P<key1>.+)</td>\s+<td class=\'all_right\'>(?P<snum>.+)</td>\s+<td class=\'all_right\'>(?P<uv>.+)</td>')
#mn=mt.findall(html)
datelist=reversed(dateutil.getDays2(startdate, enddate))
uvlist={}
for i in datelist:
#print i
pn = re.compile(
r'<td> '+i+' </td>\s+<td class="num1">(?P<snum>.+)</td>\s+<td class="num1">(?P<uv>.+)</td>'
#\s+<td class="num1">(?P<uv>.+)</td>\s+<td class="num1">(?P<ip>.+)</td>\s+<td class="num2">(?P<newuv>.+)</td>
, re.I)
mn = pn.search(html)
#print html_src2.decode('gbk').encode('utf-8')
#sitesinfo[key][i] = mn.group('uv')
#print i+"uv:"+mn.group('uv')
#print mn
if(mn):
#siteinfo[i.replace('星期六','').replace('星期天','')]=
#print i,'\t',mn.group('snum'),'uv:',mn.group('uv')
uvlist[i.replace('星期六','').replace('星期天','')]=int(mn.group('uv'))
#print uvlist
#print 'max:',self.sort_by_value(uvlist)[-1],'min:',self.sort_by_value(uvlist)[0]
return [self.sort_by_value(uvlist)[-1],self.sort_by_value(uvlist)[0]]
def sort_by_value(self,d):
return sorted(d.items(), key=lambda d:d[1])
# Commented-out usage example, disabled by wrapping it in a string literal.
'''
if __name__ == '__main__':
CnzzTool=cnzz('cnzzusername','password','mygod')
if(CnzzTool.login()):
print "LoginOk"
#CnzzTool.getSiteListPageCount()
#CnzzTool.getSiteList()
#print CnzzTool.yesterdayinfo('2918848')
#CnzzTool.getSiteInfoByDate('2918848','2012-05-20','2012-05-24')
else:
print "LoginError"
'''
@iamued
Copy link
Author

iamued commented Jan 15, 2014

老版本的cnzz api 已过期

@iamued
Copy link
Author

iamued commented Jan 15, 2014

新增新版本的 ~目前可用

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment