Created
January 25, 2014 08:08
-
-
Save onetown/8613321 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: UTF-8 -*- | |
import os | |
import re | |
import logging | |
import random | |
import pymongo | |
import datetime | |
import tornado.httpserver | |
import tornado.options | |
import tornado.ioloop | |
import tornado.web | |
import tornado.httpclient | |
import tornado.escape | |
from tornado.httpclient import HTTPError | |
from tornado.options import define, options | |
from lxml import html as HTML | |
from lxml.html.soupparser import fromstring | |
from lxml.etree import tostring | |
from bson.objectid import ObjectId | |
import gutil | |
import ads | |
# Command-line options (parsed by tornado.options.parse_command_line()).
# Every option carries help text so that ``--help`` is self-explanatory.
define("port", default=8000, help="run on the given port", type=int)
define("dbhost", default="localhost", help="MongoDB host to connect to")
define("dbname", default="zzdocs", help="name of the main MongoDB database")
define("debug", default=False, help="enable Tornado debug mode")
class Application(tornado.web.Application):
    """Tornado application: URL routing table plus shared, app-wide data.

    Besides the routes and template/static settings, the instance exposes:

    * ``support_doctypes`` -- (extension, label) pairs of document types.
    * ``bots`` -- User-Agent strings; SearchHandler compares the incoming
      User-Agent against this list.
    * ``static_keywords`` -- list that load_random_keywords() fills in.
    """

    def __init__(self):
        # Order matters: the catch-all NotFound route has to stay last.
        routes = [
            (r"/", IndexHandler),
            (r"/404", NotFound),
            (r"/download", DownloadHandler),
            (r"/d/([^/]+)", DownloadHandler),
            (r"/search", SearchHandler),
            (r"/s/([^/]+)", SearchHandler),
            (r"/s/([^/]+)/([^/]+)", SearchHandler),
            (r"/magazine", MagzineIndex),
            (r"/magazine/([^/]+)", MagzineIndex),
            (r"/journal/([^/]+)", JournalHandler),
            (r"/magazine/category/([^/]+)", MagzineCategoryHandler),
            (r"/dmca.html", DmcaHandler),
            (r".*", NotFound),
        ]

        # Templates and static files live next to this source file.
        here = os.path.dirname(__file__)
        app_settings = {
            "debug": options.debug,
            "template_path": os.path.join(here, "templates"),
            "static_path": os.path.join(here, "static"),
        }

        self._support_doctypes = [
            ("pdf", "Acrobat PDF"),
            ("doc", "Word 2003"),
            ("docx", "Word 2007/2010"),
            ("xls", "Excel 2003"),
            ("xlsx", "Excel 2007/2010"),
            ("ppt", "Powerpoint 2003"),
            ("pptx", "Powerpoint 2007/2010"),
            ("pages", "Apple pages"),
            ("numbers", "Apple Numbers"),
            ("keynote", "Apple Keynote"),
        ]

        # Exact User-Agent strings (the Googlebot entry appears twice, as in
        # the original list).
        self._bots = [
            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
            "Mediapartners-Google",
            "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
            "Crawler",
            "Baiduspider+(+http://www.baidu.com/search/spider.htm)",
            "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)",
            "Python-urllib/2.4",
            "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
            "Sosospider+(+http://help.soso.com/webspider.htm)",
            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        ]

        tornado.web.Application.__init__(self, routes, **app_settings)

        # Starts empty; populated once at startup by load_random_keywords().
        self.static_keywords = []

    @property
    def support_doctypes(self):
        """List of (file extension, human-readable label) tuples."""
        return self._support_doctypes

    @property
    def bots(self):
        """List of User-Agent strings registered in __init__."""
        return self._bots
class BaseHandler(tornado.web.RequestHandler):
    """Common base for all request handlers: DB access and page defaults."""

    # One MongoDB connection shared by every handler instance, created
    # lazily on first use. Previously a brand-new connection was opened on
    # every call, i.e. several times per request, and never closed.
    _shared_connection = None

    def get_connection(self):
        """Return the shared ``pymongo.Connection`` to ``options.dbhost``."""
        if BaseHandler._shared_connection is None:
            BaseHandler._shared_connection = pymongo.Connection(options.dbhost)
        return BaseHandler._shared_connection

    def _connection(self):
        """Return the main application database (``options.dbname``)."""
        return self.get_connection()[options.dbname]

    def _userdb(self):
        """Return the ``userdb`` database (used for the comments collection)."""
        return self.get_connection()["userdb"]

    def prepare(self):
        """Reset the per-request page attributes to their defaults."""
        self.keyword = ""
        self.title = ""
        self.metakeyword = ""
        self.doctype = "pdf"
        self.description = None

    @property
    def magzine_categories(self):
        """List of ``(name, total, id-as-string)`` tuples, highest total first.

        The list is read from the ``mag_categories`` collection once and then
        cached on the handler instance.
        """
        if not hasattr(self, '_magcates'):
            cursor = self._connection()['mag_categories'].find().sort(
                "total", pymongo.DESCENDING)
            self._magcates = [
                (cate['name'], cate["total"], str(cate['_id']))
                for cate in cursor
            ]
        return self._magcates

    def get_current_user(self):
        """No authentication: there is never a current user."""
        return None

    def get_error_html(self, status_code, **kwargs):
        """Return the HTML body used for every error response.

        Tornado's legacy ``get_error_html`` hook expects the markup to be
        *returned*; Tornado then finishes the response itself. Calling
        ``render()`` here (as before) finished the response inside the hook
        and handed ``None`` back to Tornado.
        """
        print("404")
        return self.render_string("404.html")

    def get_random_ad(self, size="300_250"):
        """Return a random ad snippet for ``size``, or ``None`` if none exist.

        Only the ``"300_250"`` size is known; any other size yields ``None``.
        """
        candidates = ads.ad_300_250 if size == "300_250" else []
        if len(candidates) > 0:
            return random.choice(candidates)
        return None
class IndexHandler(BaseHandler):
    """Home page showing a random selection of preloaded keywords."""

    # Number of keywords shown on the home page.
    KEYWORD_COUNT = 18

    def get(self):
        """Render ``index.html`` with up to KEYWORD_COUNT random keywords.

        Keywords are drawn with replacement from
        ``application.static_keywords``. When that list is empty the page is
        rendered with no keywords; the previous index arithmetic raised
        ``ValueError`` in that situation.
        """
        pool = self.application.static_keywords
        if pool:
            rand_keywords = [random.choice(pool)
                             for _ in range(self.KEYWORD_COUNT)]
        else:
            rand_keywords = []
        self.render("index.html", rand_keywords=rand_keywords)
class DmcaHandler(BaseHandler):
    """Serves the DMCA page."""

    def get(self):
        """Render the ``dmca.html`` template without any arguments."""
        template_name = "dmca.html"
        self.render(template_name)
class NotFound(BaseHandler):
    """Fallback handler for ``/404`` and every unmatched URL."""

    def get(self):
        """Render ``404.html`` with a real 404 status.

        Without the explicit status the error page was delivered as
        ``200 OK``, so clients and crawlers could not tell that the URL does
        not exist.
        """
        self.set_status(404)
        self.render("404.html")
class DownloadHandler(BaseHandler):
    """Document detail page (GET) and its comment endpoint (POST)."""

    def get(self, id=None):
        """Render ``download.html`` for one document.

        The document id is taken from the URL capture group when present,
        otherwise from the ``id`` query argument.

        Raises:
            tornado.web.HTTPError: 404 when ``gutil.getItem`` finds nothing.
        """
        tid = id if id else self.get_argument("id", "")
        item = gutil.getItem(tid, self._connection())
        if not item:
            # tornado.web.HTTPError is the class RequestHandler converts into
            # an HTTP status; the httpclient HTTPError imported at file level
            # would surface as a 500.
            raise tornado.web.HTTPError(404)

        self.title = item["keyword"]
        self.keyword = item["keyword"]
        self.description = item["despstr"]

        # "relationkw" is iterated as a sequence of sequences of HTML
        # fragments; only the text content of each fragment is kept.
        related = [
            HTML.fromstring(fragment).text_content()
            for group in item["relationkw"]
            for fragment in group
        ]
        # Join with the main keyword so a comma also separates it from the
        # first related keyword (previously they were glued together).
        self.metakeyword = ",".join([self.keyword] + related)
        self.render("download.html", item=item)

    def post(self, id=None):
        """Attach a comment to a document and answer ``{"result": "ok"}``.

        Reads the ``name``, ``email`` and ``message`` form arguments; the
        message is XHTML-escaped before it is stored. The comment is appended
        to the document and also inserted into ``userdb.comments``.

        Raises:
            tornado.web.HTTPError: 404 when the id is missing or unknown.
        """
        if not id:
            raise tornado.web.HTTPError(404)
        db = self._connection()
        item = gutil.getItem(id, db)
        if not item:
            raise tornado.web.HTTPError(404)

        comment = dict(
            id=id,
            name=self.get_argument("name", ""),
            email=self.get_argument("email", ""),
            message=tornado.escape.xhtml_escape(
                self.get_argument("message", "")),
            date=datetime.datetime.utcnow(),
        )

        # Documents that were never commented on may lack the field (the
        # journal handler guards against the same situation).
        if not item.get("comments"):
            item["comments"] = []
        item["comments"].append(comment)
        db["documents"].save(item)
        self._userdb()["comments"].insert(comment)

        self.write(tornado.escape.json_encode(dict(result="ok")))
        self.finish()
class SearchHandler(BaseHandler):
    """Keyword search: fetches an upstream result page and re-renders it."""

    # User-Agent sent upstream when the visitor is a known bot or sent no
    # User-Agent header at all.
    _BROWSER_UA = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C; .NET4.0E)"

    @tornado.web.asynchronous
    def get(self, k=None, pageno=1):
        """Start an asynchronous search for keyword ``k``.

        Args:
            k: keyword from the URL (``/s/<k>``); may be ``None``.
            pageno: 1-based page number from the URL; anything that is not a
                positive integer is treated as page 1.

        A non-empty ``q`` query argument redirects to the canonical
        ``/s/<q>`` URL instead of searching. The ``dt`` query argument
        selects the document type (default ``"pdf"``).
        """
        self.doctype = self.get_argument("dt", "pdf")
        try:
            # Clamp so the computed result offset can never be negative.
            self.pageno = max(1, int(pageno))
        except (TypeError, ValueError):
            self.pageno = 1

        # '/' would split the keyword across URL segments, so it becomes '+'.
        tk = self.get_argument("q", "").replace('/', '+')
        if tk != "":
            self.redirect("/s/" + tk)
            return

        if not k:
            # No keyword supplied: a fixed placeholder is used. (The original
            # note mentions a random keyword, but none is picked here.)
            k = "test"
        k = k.replace('/', '+')
        self.keyword = k
        self.title = k

        start = (self.pageno - 1) * 10
        gurl = gutil.getGUrl(k, t=self.doctype, start=start)

        # Forward the visitor's User-Agent, except for listed bots (exact
        # string match) or a missing header, which previously raised KeyError.
        ua = self.request.headers.get("User-Agent")
        if not ua or ua in self.application.bots:
            ua = self._BROWSER_UA

        req = tornado.httpclient.HTTPRequest(gurl)
        req.headers["User-Agent"] = ua
        tornado.httpclient.AsyncHTTPClient().fetch(req, self._onload)

    def _onload(self, resp):
        """Fetch callback: parse the upstream page and render the results."""
        if resp.body is None:
            # The fetch produced no body at all (e.g. connection failure);
            # there is nothing to parse, so report a gateway error.
            logging.error("search fetch for %r failed: %s",
                          self.keyword, resp.error)
            self.send_error(502)
            return

        try:
            body = resp.body.decode("utf-8", 'ignore')
        except (AttributeError, UnicodeError):
            # Fall back to the raw body if it cannot be decoded.
            body = resp.body

        page = gutil.parse_sr(body, kw=self.keyword, pn=self.pageno,
                              db=self._connection())
        self.metakeyword = page["keywords"]
        self.render("search.html", page=page)
class MagzineIndex(BaseHandler):
    """Magazine landing page: latest issues, or all issues of one magazine."""

    def get(self, magid=None):
        """Render ``magzine.html``.

        Args:
            magid: optional magazine ObjectId string from the URL. When it is
                valid, every issue of that magazine is listed (no limit) and
                the magazine's category is passed to the template. When it is
                absent or invalid, the 15 most recent issues are listed.
        """
        query = {}
        limit = 15
        title = "The latest magzines"
        category = None

        if magid:
            try:
                query["magid"] = ObjectId(magid)
                limit = 0  # pymongo treats a limit of 0 as "no limit"
            except Exception:
                # Not a valid ObjectId: deliberately fall back to the
                # unfiltered "latest" listing.
                logging.warning("ignoring invalid magazine id %r", magid)

        # Use one database handle for both queries of this request.
        db = self._connection()
        cursor = db['books'].find(query).sort(
            "datetime", pymongo.DESCENDING).limit(limit)

        if 'magid' in query:
            mag = db['magzines'].find_one({'_id': query['magid']})
            if mag:
                category = mag['category']

        self.render("magzine.html", books=list(cursor), title=title,
                    category=category)
class JournalHandler(BaseHandler):
    """Single journal issue page (GET) and its comment endpoint (POST)."""

    def _parse_object_id(self, raw):
        """Convert ``raw`` to an ObjectId or raise a 404 for a malformed id."""
        try:
            return ObjectId(raw)
        except Exception as e:
            logging.error(e)
            raise tornado.web.HTTPError(404)

    def get(self, id=None):
        """Render ``journal.html`` for the issue identified by ``id``.

        Anything after a ``#`` in ``id`` is ignored. Without an id the
        request is redirected to the magazine index.

        Raises:
            tornado.web.HTTPError: 404 for a malformed or unknown id.
        """
        if not id:
            # "/magazine" is the route registered for the index page.
            self.redirect("/magazine")
            return

        # Keep only the part before '#'. (The original assigned this to a
        # misspelled variable, so such ids were never looked up.)
        oid = id.split('#')[0]
        book = self._connection().books.find_one(
            {'_id': self._parse_object_id(oid)})
        if not book:
            raise tornado.web.HTTPError(404)

        if book.get("comments") is None:
            book["comments"] = []
        self.render("journal.html", item=book)

    def post(self, id=None):
        """Attach a comment to an issue and answer ``{"result": "ok"}``.

        Reads the ``name``, ``email`` and ``message`` form arguments; the
        message is XHTML-escaped before it is stored. The comment is appended
        to the issue and also inserted into ``userdb.comments``.

        Raises:
            tornado.web.HTTPError: 404 for a missing, malformed or unknown id.
        """
        if not id:
            raise tornado.web.HTTPError(404)

        books = self._connection().books
        journal = books.find_one({"_id": self._parse_object_id(id)})
        if not journal:
            raise tornado.web.HTTPError(404)

        comment = dict(
            id=id,
            name=self.get_argument("name", ""),
            email=self.get_argument("email", ""),
            message=tornado.escape.xhtml_escape(
                self.get_argument("message", "")),
            date=datetime.datetime.utcnow(),
        )

        # The field may be missing or None on issues without comments.
        if journal.get("comments") is None:
            journal["comments"] = []
        journal["comments"].append(comment)
        books.save(journal)
        self._userdb()["comments"].insert(comment)

        self.write(tornado.escape.json_encode(dict(result="ok")))
        self.finish()
class MagzineCategoryHandler(BaseHandler):
    """Lists magazines of one category together with their newest issue."""

    # Maximum number of magazines shown on a category page.
    MAX_MAGAZINES = 24

    def get(self, name=None):
        """Render ``category.html``.

        Args:
            name: category name from the URL. Without it, magazines of all
                categories are listed under a translated generic heading.

        Each magazine gets its most recent issue attached under the ``new``
        key; magazines without any issue are logged and left out.
        """
        if name:
            categoryname = name
            query = {'category': name}
        else:
            categoryname = self.locale.translate("Magzines")
            query = {}

        # Resolve the database once instead of once per magazine in the loop.
        db = self._connection()
        mags = []
        for mag in db.magzines.find(query, limit=self.MAX_MAGAZINES):
            newest = db.books.find_one(
                {'magid': mag['_id']},
                sort=[("datetime", pymongo.DESCENDING)])
            if not newest:
                logging.error("dont find magzines in id %s", str(mag['_id']))
                continue
            mag['new'] = newest
            mags.append(mag)

        self.render("category.html", magzines=mags, catename=categoryname)
def load_random_keywords(app):
    """Fill ``app.static_keywords`` with a random window of keyword documents.

    Reads up to 3000 consecutive documents from the ``keywords`` collection,
    starting at a random offset, and appends each one to
    ``app.static_keywords``.

    Args:
        app: object exposing a ``static_keywords`` list.
    """
    db = pymongo.Connection(options.dbhost)[options.dbname]
    col = db["keywords"]
    amount = 3000
    # With fewer than ``amount`` documents the upper bound would be negative
    # and random.randint() would raise ValueError; start at 0 in that case.
    last_start = max(0, col.count() - amount)
    offset = random.randint(0, last_start)
    for doc in col.find(skip=offset, limit=amount):
        app.static_keywords.append(doc)
def main():
    """Parse the command line, preload keywords and run the HTTP server.

    Blocks inside the IOLoop; Ctrl-C stops the loop and logs a message.
    """
    tornado.options.parse_command_line()

    application = Application()
    load_random_keywords(application)

    server = tornado.httpserver.HTTPServer(application)
    server.listen(options.port)

    # IOLoop.instance() always hands back the same global loop object.
    loop = tornado.ioloop.IOLoop.instance()
    try:
        loop.start()
    except KeyboardInterrupt:
        loop.stop()
        logging.info("server stoped")
# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    main()
# vim: ts=4 sts=4 sw=4 si et | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment