#!/usr/bin/env python
# -*- coding:utf-8 -*-
# import gevent.monkey
# gevent.monkey.patch_all()
import gevent
import MySQLdb
import os
import qiniu.io  # needed by SoureHandler.worker() for qiniu.io.put() (old v6-style qiniu SDK)
import re
import requests
import shutil
import time
from BeautifulSoup import BeautifulSoup
from xml.dom.minidom import parseString
from xml.parsers.expat import ExpatError
from gevent.queue import Empty, Queue
from urlparse import urlparse, parse_qs
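
# Overview (added note): this gist chains several scraping helpers together.
#   C            - scrapes "name url" pairs into rss.txt and bulk-inserts them into a local MySQL "source" table.
#   R            - one-off helpers: dump the gate.guokr.com category tree and sanity-check a feed URL.
#   F            - gevent-based crawler for findex.cn that writes "category###title###feed###site" lines.
#   SoureHandler - loads a crawled file into Category/Source ORM rows and mirrors favicons to qiniu.
# Note that gevent.monkey.patch_all() is left commented out above (as in the original),
# so the worker greenlets block on each HTTP request instead of running concurrently.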

class C(object):
    def __init__(self, url):
        self.url = url
        self.insert()

    def spider(self):
        # Scrape "<name> <url>" pairs from self.url and append them to rss.txt.
        r = requests.get(self.url)
        text = r.text.encode('utf-8', 'ignore')
        p = re.compile(r'<a style="text-decoration: none; color: rgb\(0, 0, 0\); " rel="nofollow" href="(.*?)" >(.*?)</a>')
        f = open("rss.txt", 'a')
        for m in p.finditer(text):
            print m.group(2) + m.group(1)
            f.write(m.group(2) + " " + m.group(1) + "\n")
        f.close()

    def insert(self):
        # Read rss.txt, fetch every feed, download its favicon and insert a row
        # into the local MySQL "source" table. Lines containing only digits
        # switch the current category id (cid).
        f = open("rss.txt", 'r')
        conn = MySQLdb.connect(host='localhost', user='root', passwd='', db='r', port=3306, charset="utf8")
        cur = conn.cursor()
        oid = 0
        cid = 0
        for l in f.readlines():
            if l.strip().isdigit():
                cid = l.strip()
            else:
                n, u = l.split(" ", 1)
                try:
                    r = requests.get(u.strip(), timeout=20)
                except Exception:
                    continue
                if r.status_code != 200:
                    continue
                oid = oid + 1
                try:
                    text = r.text.encode('utf-8', 'ignore')
                    c = re.compile(r'<description>([\s\S]*?)</description>')
                    d = c.search(text).group(1)
                    if d.strip() == "":
                        c = re.compile(r'<title>([\s\S]*?)</title>')
                        d = c.search(text).group(1)
                except Exception:
                    d = ""
                c = re.compile(r'(http://.*?\.(com|cn|info|me|org|net|uk)/)')
                domain = c.search(u).group(0)
                try:
                    # Save the site's favicon locally under icon/<orderid>.ico.
                    response = requests.get(domain + "favicon.ico", stream=True)
                    with open(os.getcwd() + "/icon/" + str(oid) + ".ico", 'wb') as out_file:
                        shutil.copyfileobj(response.raw, out_file)
                    del response
                    icon = "1@icon/" + str(oid) + ".ico"
                except Exception:
                    icon = ""
                try:
                    cur.execute("insert into source (title,description,icon,url,cid,orderid,islock) values (%s,%s,%s,%s,%s,%s,%s)",
                                (n.strip(), d.strip(), icon, u.strip(), cid, oid, 0))
                except Exception:
                    print cur._executed
        conn.commit()
        f.close()

class R(object):
    def __init__(self, url):
        self.url = url

    @staticmethod
    def category():
        # Dump the category tree of gate.guokr.com into categories.txt
        # ("P:" marks a parent block, "S:" a sub-category, other lines are "name url").
        r = requests.get("http://gate.guokr.com/")
        soup = BeautifulSoup(r.content)
        categorys_hd = soup.findAll("div", {"class": "categorys-hd fix"})
        f = open("categories.txt", 'w')
        for item in categorys_hd:
            category_soup = BeautifulSoup(item.prettify())
            category = category_soup.findAll("div", {"class": "category"})
            f.write("P:\n")
            for c in category:
                s = BeautifulSoup(c.prettify())
                titles = s.findAll("h3")
                for title in titles:
                    f.write("S:" + title.text.encode('utf-8', 'ignore') + "\n")
                sites = s.findAll("li")
                for site in sites:
                    f.write(site.text.encode('utf-8', 'ignore') + " " + site.a['href'].encode('utf-8') + "\n")
        f.close()

    @staticmethod
    def findRss():
        # Print the first rss/atom/feed/xml link found on the page (placeholder domain).
        url = "http://www.domain.com"
        r = requests.get(url)
        soup = BeautifulSoup(r.content)
        a = soup.find("a", href=re.compile("(rss|atom|feed|xml)", re.I))
        if a is not None:
            if a['href'].startswith('/'):
                if url.endswith('/'):
                    url = url[0:-1]
                href = url + a['href']
            else:
                href = a['href']
            print href

    @staticmethod
    def xml():
        # Return True if the (placeholder) feed URL parses as well-formed XML.
        url = 'http://www.domain.com/index.xml'
        r = requests.get(url)
        try:
            parseString(r.content)
            return True
        except ExpatError:
            return False

class F(object):
    def __init__(self, url, filePath, thread_num=5, timeout=5):
        self.url = url
        self.queue = Queue()
        self.timeout = timeout
        self.file = open(filePath, 'w')
        self.jobs = [gevent.spawn(self.doScheduler)]
        self.jobs.extend([gevent.spawn(self.doWorker) for i in xrange(thread_num)])
        self.start()

    def start(self):
        gevent.joinall(self.jobs)

    def doScheduler(self):
        # Producer: walk every "more.php?id=..." category link in the sidebar,
        # read its page count and queue one (categoryid, category, page_url) item per page.
        r = requests.get(self.url, timeout=10)
        soup = BeautifulSoup(r.content)
        sidebar = soup.find('div', {'id': 'sidebar'})
        sidebar = BeautifulSoup(sidebar.prettify())
        more = sidebar.findAll('a', {'href': re.compile(r'more.php\?id=\d*')})
        categoryid = 1
        for m in more:
            href = self.url + str(m['href'])
            r = requests.get(href, timeout=10)
            soup = BeautifulSoup(r.content)
            navi = soup.find('div', {'id': 'navi'})
            navi = BeautifulSoup(navi.prettify())
            try:
                pagesize = navi('a')[-2]
            except IndexError:
                pagesize = navi('a')[-1]
            except Exception as e:
                print e
                continue
            for i in xrange(int(pagesize.text)):
                item = categoryid, m.text.replace('&amp;', '&'), re.sub(r'page=\d*', 'page=' + str(i + 1), self.url + pagesize['href'])
                self.queue.put(item)
            categoryid = categoryid + 1

    def doWorker(self):
        # Consumer: pull page URLs off the queue and write one
        # "category###title###feed_url###site" line per post until the queue stays empty.
        while True:
            try:
                categoryid, category, url = self.queue.get(timeout=self.timeout)
                r = requests.get(url, timeout=15)
                soup = BeautifulSoup(r.content)
                posts = soup.findAll('div', {'class': 'post'})
                for post in posts:
                    soup = BeautifulSoup(post.prettify())
                    title = soup.find('h2').text
                    site = soup('a')[-1]['href']
                    source = soup.find('a', href=re.compile('^http://findex.cn/subscripe.php'))
                    o = urlparse(source['href'])
                    params = parse_qs(o.query)
                    url = params['url'][0]
                    self.file.write(category.encode('utf-8').strip() + '###' + title.encode('utf-8').strip() + '###' + url.encode('utf-8').strip() + '###' + site.encode('utf-8').strip() + "\n")
                    print url
            except Empty:
                self.file.close()
                return
            except Exception as e:
                print e
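
# The SoureHandler class below uses names that are never defined in this gist:
# DB_Session, Category, Source, bucket_name and uptoken (plus the qiniu SDK imported
# above). A minimal SQLAlchemy-style sketch of what those definitions might look like
# is given here, commented out; the table and column names are assumptions inferred
# from how the objects are used below, not the author's actual models.
#
#   from sqlalchemy import Column, Integer, String, create_engine
#   from sqlalchemy.ext.declarative import declarative_base
#   from sqlalchemy.orm import sessionmaker
#
#   Base = declarative_base()
#
#   class Category(Base):
#       __tablename__ = 'category'
#       id = Column(Integer, primary_key=True)
#       pid = Column(Integer)
#       title = Column(String(255))
#       description = Column(String(255))
#       photo = Column(String(255))
#       orderid = Column(Integer)
#       islock = Column(Integer)
#
#   class Source(Base):
#       __tablename__ = 'source'
#       id = Column(Integer, primary_key=True)
#       title = Column(String(255))
#       description = Column(String(255))
#       icon = Column(String(255))
#       url = Column(String(255))
#       cid = Column(Integer)
#       orderid = Column(Integer)
#       islock = Column(Integer)
#
#   engine = create_engine('mysql://root:@localhost:3306/r?charset=utf8')
#   DB_Session = sessionmaker(bind=engine)
#   bucket_name = 'your-bucket-name'      # qiniu bucket
#   uptoken = 'your-qiniu-upload-token'   # qiniu upload token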

class SoureHandler(object):
    def __init__(self, file_path):
        self.file = file_path
        self.DB_Session = DB_Session()    # SQLAlchemy-style session; DB_Session must be defined elsewhere
        self.pid = 0
        self.categoryid = 0
        self.bucket_name = bucket_name    # qiniu bucket name, expected as a module-level global
        self.uptoken = uptoken            # qiniu upload token, expected as a module-level global
        self.worker()

    def worker(self):
        # Import the crawled file: "C:" lines create top-level categories, "S:" lines
        # create sub-categories, everything else is a "title url" pair to turn into a Source.
        f = open(self.file, 'r')
        for line in f.readlines():
            if line.startswith('C:'):
                _, category_title = line.split(':', 1)
                category = Category(pid=0, title=category_title.strip(), description='', photo='', orderid=0, islock=0)
                self.DB_Session.add(category)
                self.DB_Session.commit()
                self.pid = category.id
            elif line.startswith('S:'):
                _, category_title = line.split('S:')
                category = Category(pid=self.pid, title=category_title.strip(), description='', photo='', orderid=0, islock=0)
                self.DB_Session.add(category)
                self.DB_Session.commit()
                self.categoryid = category.id
            else:
                soure_title, url = line.split(' ', 1)
                soure_title = soure_title.strip()
                url = url.strip()
                link = self.findRss(url)
                if link is not None and self.verifyRss(link):
                    try:
                        r = requests.get(link, timeout=15)
                        po = re.compile(r'<description>([\s\S]*?)</description>')
                        pt = re.compile(r'<title>([\s\S]*?)</title>')
                        description = po.search(r.content).group(1) or pt.search(r.content).group(1)
                    except Exception:
                        description = ""
                    if not url.endswith('/'):
                        url = str(url) + '/'
                    try:
                        # Mirror the site's favicon to qiniu and build its public URL.
                        response = requests.get(url + "favicon.ico", stream=True)
                        key = "categoryid" + str(self.categoryid) + '_' + str(int(time.time() * 100000)) + '.ico'
                        ret, err = qiniu.io.put(self.uptoken, key, data=response.raw)
                        if err is not None:
                            print err
                            return
                        icon = "http://%s.u.qiniudn.com/%s" % (self.bucket_name, key)
                    except Exception:
                        icon = ""
                    source = Source(title=soure_title, description=description.strip(), icon=icon, url=link, cid=self.categoryid, orderid=0, islock=0)
                    self.DB_Session.add(source)
                    self.DB_Session.commit()

    def findRss(self, url):
        # Return the first rss/atom/feed/xml link found on the page, or None.
        link = None
        try:
            r = requests.get(url, timeout=15)
        except Exception:
            return None
        soup = BeautifulSoup(r.content)
        a = soup.find("a", href=re.compile("(rss|atom|feed|xml)"))
        if a is not None:
            if a['href'].startswith('/'):
                if url.endswith('/'):
                    url = url[0:-1]
                link = str(url) + a['href']
            else:
                link = a['href']
        return link

    def verifyRss(self, url):
        # A feed is accepted only if its body parses as well-formed XML.
        try:
            r = requests.get(url, timeout=15)
            parseString(r.content)
            return True
        except Exception:
            return False

if __name__ == "__main__":
    f = F('http://findex.cn/', 'rss.txt')
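
# The __main__ block above only runs the findex.cn crawl (class F). The other stages
# would be invoked roughly like this (assumed usage, inferred from the classes above;
# the input URL is a placeholder):
#
#   C('http://example.com/page-with-feed-links')   # __init__ immediately calls insert(), loading rss.txt into MySQL
#   R.category()                                   # dump the gate.guokr.com category tree into categories.txt
#   SoureHandler('rss.txt')                        # import a crawled file into the ORM and push favicons to qiniu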