Skip to content

Instantly share code, notes, and snippets.

@onetown
Created January 25, 2014 08:08
Show Gist options
  • Save onetown/8613321 to your computer and use it in GitHub Desktop.
Save onetown/8613321 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import os
import re
import logging
import random
import pymongo
import datetime
import tornado.httpserver
import tornado.options
import tornado.ioloop
import tornado.web
import tornado.httpclient
import tornado.escape
from tornado.httpclient import HTTPError
from tornado.options import define, options
from lxml import html as HTML
from lxml.html.soupparser import fromstring
from lxml.etree import tostring
from bson.objectid import ObjectId
import gutil
import ads
# Command-line options; values are filled in by
# tornado.options.parse_command_line() in main().
define("port", default=8000, help="run on the given port", type=int)
define("dbhost", default="localhost", help="MongoDB host to connect to")
define("dbname", default="zzdocs", help="name of the MongoDB database to use")
define("debug", default=False, help="run the Tornado application in debug mode")
class Application(tornado.web.Application):
    """Tornado application object: URL routes plus site-wide lookup tables."""

    def __init__(self):
        """Build the route table and settings, then initialise Tornado."""
        here = os.path.dirname(__file__)

        # Order matters: the catch-all NotFound route must stay last.
        routes = [
            (r"/", IndexHandler),
            (r"/404", NotFound),
            (r"/download", DownloadHandler),
            (r"/d/([^/]+)", DownloadHandler),
            (r"/search", SearchHandler),
            (r"/s/([^/]+)", SearchHandler),
            (r"/s/([^/]+)/([^/]+)", SearchHandler),
            (r"/magazine", MagzineIndex),
            (r"/magazine/([^/]+)", MagzineIndex),
            (r"/journal/([^/]+)", JournalHandler),
            (r"/magazine/category/([^/]+)", MagzineCategoryHandler),
            (r"/dmca.html", DmcaHandler),
            (r".*", NotFound),
        ]

        # (file extension, human-readable label) pairs.
        self._support_doctypes = [
            ("pdf", "Acrobat PDF"),
            ("doc", "Word 2003"),
            ("docx", "Word 2007/2010"),
            ("xls", "Excel 2003"),
            ("xlsx", "Excel 2007/2010"),
            ("ppt", "Powerpoint 2003"),
            ("pptx", "Powerpoint 2007/2010"),
            ("pages", "Apple pages"),
            ("numbers", "Apple Numbers"),
            ("keynote", "Apple Keynote"),
        ]

        # Exact User-Agent strings treated as crawlers by SearchHandler.
        # The Googlebot string appears twice in this table.
        self._bots = [
            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
            "Mediapartners-Google",
            "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
            "Crawler",
            "Baiduspider+(+http://www.baidu.com/search/spider.htm)",
            "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)",
            "Python-urllib/2.4",
            "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
            "Sosospider+(+http://help.soso.com/webspider.htm)",
            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        ]

        tornado.web.Application.__init__(
            self,
            routes,
            debug=options.debug,
            template_path=os.path.join(here, "templates"),
            static_path=os.path.join(here, "static"),
        )

        # Filled by load_random_keywords() at start-up; read by IndexHandler.
        self.static_keywords = []

    @property
    def support_doctypes(self):
        """List of (extension, label) tuples for the supported document types."""
        return self._support_doctypes

    @property
    def bots(self):
        """List of User-Agent strings recognised as crawlers."""
        return self._bots
class BaseHandler(tornado.web.RequestHandler):
    """Base class for every page handler.

    Provides MongoDB access helpers, the per-request page fields (title,
    keyword, meta keywords, doctype, description), error-page rendering and
    ad selection.
    """

    # One MongoDB connection shared by all handler instances in this process;
    # opened lazily by get_connection().
    _shared_connection = None

    def get_connection(self):
        """Return the process-wide MongoDB connection, opening it on first use.

        Previously a brand-new connection was opened on every call (several
        times per request) and never closed, leaking sockets.
        """
        if BaseHandler._shared_connection is None:
            BaseHandler._shared_connection = pymongo.Connection(options.dbhost)
        return BaseHandler._shared_connection

    def _connection(self):
        """Return the main site database (named by the ``dbname`` option)."""
        return self.get_connection()[options.dbname]

    def _userdb(self):
        """Return the ``userdb`` database, where comments are also stored."""
        return self.get_connection()["userdb"]

    def prepare(self):
        """Reset the per-request page fields to their defaults."""
        self.keyword = ""
        self.title = ""
        self.metakeyword = ""
        self.doctype = "pdf"
        self.description = None

    @property
    def magzine_categories(self):
        """Magazine categories as ``(name, total, id_string)`` tuples.

        Read from the ``mag_categories`` collection sorted by ``total``
        descending, and cached on this handler instance after the first access.
        """
        if not hasattr(self, '_magcates'):
            cursor = self._connection()['mag_categories'].find().sort(
                "total", pymongo.DESCENDING)
            self._magcates = [
                (cate['name'], cate["total"], str(cate['_id']))
                for cate in cursor
            ]
        return self._magcates

    def get_current_user(self):
        """No authentication is implemented: every request is anonymous."""
        return None

    def get_error_html(self, status_code, **kwargs):
        """Return the HTML of the error page for *status_code*.

        Tornado's hook expects the HTML to be returned and finishes the
        response itself; calling ``render()`` here would finish it twice.
        The same ``404.html`` template is used for every status code.
        """
        logging.warning("rendering error page for HTTP status %s", status_code)
        return self.render_string("404.html")

    def get_random_ad(self, size="300_250"):
        """Return a random ad snippet for *size*, or None if none is available.

        Only the ``"300_250"`` size is known; any other size yields None.
        """
        candidates = ads.ad_300_250 if size == "300_250" else []
        if not candidates:
            return None
        return random.choice(candidates)
class IndexHandler(BaseHandler):
    """Front page: shows a handful of randomly chosen keywords."""

    # Number of keywords displayed on the index page.
    KEYWORD_COUNT = 18

    def get(self):
        """Render ``index.html`` with KEYWORD_COUNT random keywords.

        Keywords are drawn with replacement from the application's preloaded
        ``static_keywords``. When that pool is empty the page is rendered with
        no keywords instead of failing with a ValueError.
        """
        pool = self.application.static_keywords
        if pool:
            rand_keywords = [random.choice(pool) for _ in range(self.KEYWORD_COUNT)]
        else:
            rand_keywords = []
        self.render("index.html", rand_keywords=rand_keywords)
class DmcaHandler(BaseHandler):
    """Serves the DMCA notice page."""

    def get(self):
        """Render the ``dmca.html`` template."""
        template_name = "dmca.html"
        self.render(template_name)
class NotFound(BaseHandler):
    """Catch-all handler that shows the "not found" page."""

    def get(self):
        """Render ``404.html`` with an HTTP 404 status.

        Without the explicit status the page would be sent as 200 OK, so
        clients and crawlers would treat missing URLs as valid pages.
        """
        self.set_status(404)
        self.render("404.html")
class DownloadHandler(BaseHandler):
    """Document detail page (GET) and comment submission (POST)."""

    def get(self, id=None):
        """Render the download page for a document.

        The document id comes from the URL path when present, otherwise from
        the ``id`` query argument.

        Raises:
            tornado.web.HTTPError: 404 when the document cannot be found.
        """
        tid = id if id else self.get_argument("id", "")
        item = gutil.getItem(tid, self._connection())
        if not item:
            # Must be tornado.web.HTTPError: the tornado.httpclient.HTTPError
            # imported at file level is reported as an uncaught error (500).
            raise tornado.web.HTTPError(404)

        self.title = item["keyword"]
        self.keyword = item["keyword"]
        self.description = item["despstr"]

        # Each entry of "relationkw" is iterated as a group of HTML
        # fragments; only their text content is used.
        related = []
        for group in item["relationkw"]:
            for fragment in group:
                related.append(HTML.fromstring(fragment).text_content())
        # Join with a comma so the keyword is not fused with the first
        # related term.
        self.metakeyword = ",".join([self.keyword] + related)

        self.render("download.html", item=item)

    def post(self, id=None):
        """Attach a comment to a document and reply with ``{"result": "ok"}``.

        The comment is appended to the document's ``comments`` list and also
        inserted into the ``comments`` collection of the user database.

        Raises:
            tornado.web.HTTPError: 404 when no id is given or the document
                cannot be found.
        """
        if not id:
            raise tornado.web.HTTPError(404)
        item = gutil.getItem(id, self._connection())
        if not item:
            raise tornado.web.HTTPError(404)

        comment = dict(
            id=id,
            name=self.get_argument("name", ""),
            email=self.get_argument("email", ""),
            message=tornado.escape.xhtml_escape(self.get_argument("message", "")),
            date=datetime.datetime.utcnow(),
        )

        # Documents without a comment list yet must not raise KeyError.
        if "comments" not in item or item["comments"] is None:
            item["comments"] = []
        item["comments"].append(comment)
        self._connection()["documents"].save(item)
        self._userdb()["comments"].insert(comment)

        self.write(tornado.escape.json_encode(dict(result="ok")))
        self.finish()
class SearchHandler(BaseHandler):
    """Runs a search by fetching an upstream result page asynchronously."""

    # User-Agent sent upstream when the visitor is a known crawler.
    _BROWSER_UA = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C; .NET4.0E)"

    # Results per page, used to compute the upstream ``start`` offset.
    _PAGE_SIZE = 10

    @tornado.web.asynchronous
    def get(self, k=None, pageno=1):
        """Start a search for keyword *k* on page *pageno*.

        A non-empty ``q`` query argument redirects to the canonical
        ``/s/<keyword>`` URL. Otherwise the upstream URL is built with
        ``gutil.getGUrl`` and fetched asynchronously; ``_onload`` renders the
        result.
        """
        self.doctype = self.get_argument("dt", "pdf")
        try:
            # Clamp to 1 so a zero/negative page cannot yield a negative start.
            self.pageno = max(1, int(pageno))
        except (TypeError, ValueError):
            self.pageno = 1

        tk = self.get_argument("q", "").replace('/', '+')
        if tk:
            # NOTE(review): tk is user input placed in the Location header
            # unescaped; consider URL-escaping once the Tornado version is known.
            self.redirect("/s/" + tk)
            return

        if not k:
            # "Feel lucky": no keyword supplied.
            # NOTE(review): the original comment says this yields a random
            # keyword; presumably handled inside gutil, confirm.
            k = "test"
        k = k.replace('/', '+')
        self.keyword = k
        self.title = k

        start = (self.pageno - 1) * self._PAGE_SIZE
        gurl = gutil.getGUrl(k, t=self.doctype, start=start)

        # The header may be absent; .get avoids a KeyError (HTTP 500).
        ua = self.request.headers.get("User-Agent", "")
        if ua in self.application.bots:
            ua = self._BROWSER_UA
        headers = {"User-Agent": ua} if ua else None

        req = tornado.httpclient.HTTPRequest(gurl, headers=headers)
        tornado.httpclient.AsyncHTTPClient().fetch(req, self._onload)

    def _onload(self, resp):
        """Fetch callback: parse the upstream page and render the results.

        Replies with 502 when the upstream fetch produced no body (timeout,
        connection failure, ...) instead of handing None to the parser.
        """
        if resp.body is None:
            logging.error("search fetch for %r returned no body: %s",
                          self.keyword, resp.error)
            self.send_error(502)
            return

        body = resp.body.decode("utf-8", 'ignore')
        page = gutil.parse_sr(body, kw=self.keyword, pn=self.pageno,
                              db=self._connection())
        self.metakeyword = page["keywords"]
        self.render("search.html", page=page)
class MagzineIndex(BaseHandler):
    """Magazine landing page: latest issues, or all issues of one magazine."""

    def get(self, magid=None):
        """Render ``magzine.html``.

        Without *magid*, or when *magid* is not a valid ObjectId, the 15 most
        recent books are listed. With a valid *magid*, every book of that
        magazine is listed and the magazine's category is passed to the
        template.
        """
        # Local import: only this lookup needs the bson error type.
        from bson.errors import InvalidId

        query = {}
        limit = 15
        title = "The latest magzines"
        category = None

        if magid:
            try:
                query["magid"] = ObjectId(magid)
                limit = 0  # 0 means "no limit" for a Mongo cursor
            except (InvalidId, TypeError, ValueError):
                # Malformed id: fall back to the unfiltered latest list.
                pass

        db = self._connection()
        cursor = db['books'].find(query).sort(
            "datetime", pymongo.DESCENDING).limit(limit)

        if 'magid' in query:
            mag = db['magzines'].find_one({'_id': query['magid']})
            if mag:
                category = mag['category']

        self.render("magzine.html", books=list(cursor), title=title,
                    category=category)
class JournalHandler(BaseHandler):
    """Single journal (book) page (GET) and comment submission (POST)."""

    @staticmethod
    def _parse_object_id(raw):
        """Return ``ObjectId(raw)``, or None when *raw* is not a valid id."""
        # Local import: only this helper needs the bson error type.
        from bson.errors import InvalidId
        try:
            return ObjectId(raw)
        except (InvalidId, TypeError, ValueError):
            return None

    def get(self, id=None):
        """Render ``journal.html`` for the book with the given id.

        Anything after a ``#`` in *id* is ignored. Without an id the request
        is redirected to the magazine index.

        Raises:
            tornado.web.HTTPError: 404 when the id is malformed or no such
                book exists.
        """
        if not id:
            # The index route is "/magazine"; "/magzine" matches no route.
            self.redirect("/magazine")
            return

        # split() also covers ids without '#'. The original assigned the
        # stripped id to a misspelt variable and then looked up None.
        oid = self._parse_object_id(id.split('#')[0])
        if oid is None:
            logging.error("invalid journal id %r", id)
            raise tornado.web.HTTPError(404)

        book = self._connection().books.find_one({'_id': oid})
        if not book:
            raise tornado.web.HTTPError(404)

        if "comments" not in book or book["comments"] is None:
            book["comments"] = []
        self.render("journal.html", item=book)

    def post(self, id=None):
        """Attach a comment to a journal and reply with ``{"result": "ok"}``.

        The comment is appended to the journal's ``comments`` list and also
        inserted into the ``comments`` collection of the user database.

        Raises:
            tornado.web.HTTPError: 404 when the id is missing or malformed,
                or no such journal exists.
        """
        if not id:
            raise tornado.web.HTTPError(404)
        oid = self._parse_object_id(id)
        if oid is None:
            # A malformed id is a client error, not a server failure.
            raise tornado.web.HTTPError(404)

        journal = self._connection().books.find_one({"_id": oid})
        if not journal:
            raise tornado.web.HTTPError(404)

        comment = dict(
            id=id,
            name=self.get_argument("name", ""),
            email=self.get_argument("email", ""),
            message=tornado.escape.xhtml_escape(self.get_argument("message", "")),
            date=datetime.datetime.utcnow(),
        )

        # Treat a stored None like a missing list, as get() does.
        if "comments" not in journal or journal["comments"] is None:
            journal["comments"] = []
        journal["comments"].append(comment)
        self._connection().books.save(journal)
        self._userdb()["comments"].insert(comment)

        self.write(tornado.escape.json_encode(dict(result="ok")))
        self.finish()
class MagzineCategoryHandler(BaseHandler):
    """Lists magazines, optionally restricted to one category."""

    def get(self, name=None):
        """Render ``category.html`` with up to 24 magazines.

        Each listed magazine carries its most recent book under the ``new``
        key; magazines without any book are logged and left out.
        """
        if name:
            query = {'category': name}
            categoryname = name
        else:
            query = {}
            categoryname = self.locale.translate("Magzines")

        listed = []
        for magazine in self._connection().magzines.find(query, limit=24):
            latest = self._connection().books.find_one(
                {'magid': magazine['_id']},
                sort=[("datetime", pymongo.DESCENDING)],
            )
            if latest:
                magazine['new'] = latest
                listed.append(magazine)
                continue
            logging.error("dont find magzines in id %s", str(magazine['_id']))

        self.render("category.html", magzines=listed, catename=categoryname)
def load_random_keywords(app, amount=3000):
    """Append a random contiguous window of keyword documents to *app*.

    Reads up to *amount* documents from the ``keywords`` collection, starting
    at a random offset, and appends them to ``app.static_keywords``.

    When the collection holds fewer than *amount* documents the window starts
    at offset 0. Previously the random upper bound went negative in that case
    and ``random.randint`` raised ValueError at start-up.
    """
    col = pymongo.Connection(options.dbhost)[options.dbname]["keywords"]
    max_offset = max(0, col.count() - amount)
    offset = random.randint(0, max_offset)
    app.static_keywords.extend(col.find(skip=offset, limit=amount))
def main():
    """Parse options, build the application and serve until interrupted."""
    tornado.options.parse_command_line()

    application = Application()
    load_random_keywords(application)

    server = tornado.httpserver.HTTPServer(application)
    server.listen(options.port)

    loop = tornado.ioloop.IOLoop.instance()
    try:
        loop.start()
    except KeyboardInterrupt:
        # Ctrl-C: stop the loop and note the shutdown.
        loop.stop()
        logging.info("server stoped")


if __name__ == '__main__':
    main()
# vim: ts=4 sts=4 sw=4 si et
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment