Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@advancedxy
Last active August 29, 2015 14:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save advancedxy/923cb0c71399d96565c7 to your computer and use it in GitHub Desktop.
advancedxy.com's post related code
#!/usr/bin/python2
#coding=utf-8
import os
import urllib2
import zipfile
import zlib
from urlparse import urlparse
from time import strftime,localtime
def initsites(filename):
    """Parse a blank-line-separated sites file into a list of records.

    The file contains groups of lines separated by blank lines::

        ====Site Name====
        http://site.url
        getting into http://site.url/page

    The ``====`` markers and the ``getting into `` prefix are stripped.

    Returns a list of lists, one inner list of strings per group.
    """
    sites = []
    current = []
    # 'with' guarantees the handle is closed even when parsing raises
    # (the original's finally block raised NameError if open() failed).
    with open(filename, 'r') as fh:
        for line in fh:
            if line != '\n':
                line = line[:-1]            # drop the trailing newline
                if line[0:4] == '====':     # section header: ====name====
                    line = line[4:-4]
                if line[0:13] == 'getting into ':
                    line = line[13:]
                current.append(line)
            elif current:
                # a blank line terminates the current group;
                # consecutive blank lines are ignored
                sites.append(current)
                current = []
    if current:
        # bug fix: a final group not followed by a blank line used to be
        # silently dropped
        sites.append(current)
    return sites
def dividesites(sites,f1='/media/virtual/srtp/2items',f2='/media/virtual/srtp/3items',f3='/media/virtual/srtp/4items',f4='/media/virtual/srtp/5items',f5="/media/virtual/srtp/moreitems"):
    """Split site records into five files keyed by record length.

    Records of length 2, 3, 4, 5 go to f1..f4 respectively; length >= 6
    goes to f5.  Shorter (malformed) records are reported on stdout and
    skipped.  Each record is written one item per line, followed by a
    blank separator line (the format initsites() reads back).
    """
    paths = [f1, f2, f3, f4, f5]
    handles = []
    try:
        for p in paths:
            handles.append(open(p, 'w'))
        for record in sites:
            text = ''.join(item + '\n' for item in record) + '\n'
            n = len(record)
            if 2 <= n <= 5:
                # lengths 2..5 map directly onto handles[0..3]
                handles[n - 2].write(text)
            elif n >= 6:
                handles[4].write(text)
            else:
                # a record with fewer than 2 items cannot be a site entry
                print(record)
                print('There is something wrong with your sites, please check it!')
    finally:
        # bug fix: close only what was actually opened (the original's
        # finally block raised NameError when any open() failed)
        for h in handles:
            h.close()
def mkdir(dirname):
    """Ensure *dirname* exists as a directory.

    Creates the directory when the path is missing; when a regular file
    occupies the path it is deleted and replaced by a directory.  An
    already-existing directory is left untouched.
    """
    if os.path.isfile(dirname):
        # a plain file is squatting on the path: replace it
        os.remove(dirname)
        os.mkdir(dirname)
        return
    if not os.path.exists(dirname):
        os.mkdir(dirname)
def downallxml(sites):
    """Download each site's feed into <netloc>/<yymmdd>_all.

    For every record, record[1] supplies the directory name (the URL's
    netloc) and record[3] the feed URL.  The download is skipped when
    today's file already exists.  A failed fetch reports the site and
    skips it instead of crashing the whole run.
    """
    for record in sites:
        dirname = urlparse(record[1]).netloc
        print(dirname)
        url = record[3]
        print(url)
        currentdir = os.getcwd()
        mkdir(dirname)
        os.chdir(dirname)
        try:
            print(os.getcwd())
            filename = strftime("%y%m%d", localtime()) + '_all'
            if not os.path.exists(filename):
                sock = None
                try:
                    sock = urllib2.urlopen(url)
                    data = sock.read()
                except Exception:
                    # bug fix: the original closed the socket twice and
                    # then crashed on the undefined 'xml' variable; a
                    # broken site is now simply reported and skipped
                    print("we have some problem with "+url+"! please check it !")
                    continue
                finally:
                    if sock is not None:
                        sock.close()
                # bug fix: open the output only after a successful fetch
                # (the original left an empty, never-closed file behind
                # on error, which then suppressed future retries)
                out = open(filename, 'w')
                try:
                    out.write(data)
                finally:
                    out.close()
        finally:
            # always return to the starting directory, even on failure
            os.chdir(currentdir)
def zipfiles(sites):
    """Bundle every site's downloaded dump (<netloc>/<yymmdd>_all) into <yymmdd>.zip."""
    # hoisted: one timestamp for the whole run (the original recomputed
    # it per site, risking a mismatch across a midnight boundary)
    stamp = strftime("%y%m%d", localtime())
    archive = zipfile.ZipFile(stamp + '.zip', 'w', zipfile.ZIP_DEFLATED)
    try:
        for record in sites:
            dirname = urlparse(record[1]).netloc
            archive.write(os.path.join(dirname, stamp + '_all'))
    finally:
        # bug fix: close the archive even when a member file is missing
        archive.close()
if __name__ == '__main__':
    # Driver: read the site list from the local 'api' file, download
    # every site's feed, then bundle the results into a dated zip.
    # NOTE(review): 'file' shadows the builtin of the same name.
    file = 'api'
    site = initsites(file)
    #print site4
    downallxml(site)
    zipfiles(site)
#!/usr/bin/env python2
#coding=utf-8
import os
import urllib2
from pyquery import PyQuery as pyq
from urlparse import urlparse,urljoin
from downxml import initsites as init
from downxml import dividesites as divide
# Path of the scraped site list (the crawler script below writes this
# exact path).  NOTE(review): 'file' shadows the builtin of that name.
file = '/media/virtual/srtp/sites.xml'
# Keywords that mark a link's surrounding text as interesting; tried in
# order by analyse().  '百度' is Chinese for 'Baidu'.
pattern = ['baidu', 'hao123','123','百度','api']
def analyse(addr):
    """Fetch the page at *addr* and return candidate URLs found on it.

    Inspects the parent element of every <a> tag; the first keyword in
    the module-level ``pattern`` list that matches any parent's text
    wins, and the hrefs of the matching anchors are returned,
    deduplicated and resolved against the page's base URL where the
    resolved form stays on-site.  Returns [] when the page cannot be
    fetched.
    """
    try:
        # bug fix: the original fetched here, discarded the result, and
        # unconditionally fetched again via pyq(addr); on a fetch error
        # it swallowed the exception and proceeded anyway.  Fetch once
        # and bail out cleanly on failure.
        site = pyq(url=addr)
    except Exception:
        return []
    parts = urlparse(addr)
    baseurl = parts.scheme + "://" + parts.netloc
    urls = []
    for keyword in pattern:
        matched = False
        for node in site.find("a").parent():
            if keyword in pyq(node).text().lower().encode("utf-8"):
                urls += [pyq(node).find("a").attr("href")]
                matched = True
        if matched:
            break  # first keyword with any hit wins
    urls = list(set(urls))  # delete the same element
    # make links absolute, but leave off-site links untouched
    urls = [urljoin(baseurl, u) if baseurl in urljoin(baseurl, u) else u for u in urls]
    print(urls)
    return urls
if __name__ == '__main__':
    # Driver: split the scraped site list by record length, then scan
    # each 3-item site's page (record index 2) for links matching the
    # keyword patterns and append them to the record; the enriched
    # records are written to 'api' in the same blank-line-separated
    # format that initsites() reads back.
    sites = init('/media/virtual/srtp/sites.xml')
    divide(sites)
    sites = init('/media/virtual/srtp/3items')
    # NOTE(review): 'f' is assigned but never used.
    f = ''
    api=open("/media/virtual/srtp/api",'w')
    for i in sites:
        addr = i[2]
        urls = analyse(addr)
        i.extend(urls)
        for j in i:
            api.write(j+'\n')
        api.write('\n')
    api.close()
#!/usr/bin/python2
import codecs
from pyquery import PyQuery as pyq
from urlparse import urljoin
# Crawl Baidu Tuan's "all sites" page and build sites.xml: one block
# per merchant consisting of '====name====', its homepage URL, and any
# links on that homepage whose anchor text mentions "api".
sock = pyq(url='http://tuan.baidu.com/allsite.php')
site = sock('.site-span')
filepath = "/media/virtual/srtp/sites.xml"
# NOTE(review): 'r+' writes from offset 0 WITHOUT truncating, so stale
# bytes from a longer previous run survive at the tail — confirm that
# 'w' was not intended here.
xml = open(filepath,'r+')
try:
    for entry in site:
        header = '====' + pyq(entry).find('a').eq(0).text() + '===='
        print(header)
        xml.write(header.encode("utf-8")+'\n')
        homepage = pyq(entry).find('a').eq(0).attr("href")
        if homepage[-1] == '/':
            homepage = homepage[:-1]  # normalise: strip trailing slash
        # bug fix: encode before writing, matching the header line above
        # (writing unencoded unicode raised UnicodeEncodeError for any
        # non-ASCII href)
        xml.write(homepage.encode("utf-8")+'\n')
        print("getting into %s" %(homepage,))
        try:
            doc = pyq(url=homepage,parser='html')
            for anchor in doc.find('a'):
                if "api" in pyq(anchor).text().lower():
                    href = pyq(anchor).attr("href")
                    if "http" not in href:
                        # resolve relative links against the homepage
                        href = urljoin(homepage, href)
                    print(href)
                    xml.write(href.encode("utf-8")+'\n')
        except Exception:
            # best effort: skip a merchant whose page cannot be fetched
            pass
        print('\n')
        xml.write('\n')
finally:
    # bug fix: close the output file even when the crawl dies part-way
    xml.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment