Last active
October 13, 2015 17:18
-
-
Save arook/4230062 to your computer and use it in GitHub Desktop.
Monitor Client
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[redis] | |
host = '' | |
[node] | |
site_id = 1 | |
bind_ip = '' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import httplib, re, os, urllib | |
# Host name of the shop being scraped; Robot opens its HTTP connections to it
# and main1() prefixes it to image paths.
HOST = 'www.greebid.com'
# Categories to crawl, as (listing URL path on HOST, display name) pairs.
# The display name doubles as the file/directory name used under greebid/.
CATEGORY = [('/wholesale/original-launch-x431-auto-professional-scanner/', 'Original Launch X431 Auto Professional Scanner'),
            ('/wholesale/wholesale-obd-tool/', 'Wholesale OBD Tool'),
            ('/wholesale/auto-diagnostic-scanner/', 'Auto Diagnostic Scanner'),
            ('/wholesale/vag-diagnostic-tool/', 'VAG Diagnostic Tool'),
            ('/wholesale/heavy-duty-truck-diagnosis/', 'Heavy Duty Truck Diagnosis'),
            ('/wholesale/car-key-programmer/', 'Car Key programmer'),
            ('/wholesale/obd-code-reader/', 'OBD Code Reader'),
            ('/wholesale/ecu-programmer-tool/', 'ECU Programmer tool'),
            ('/wholesale/auto-diagnostic-software/', 'Auto Diagnostic software'),
            ('/wholesale/odometer-correction-tools/', 'Odometer Correction Tools'),
            ('/wholesale/airbag-reset-tools/', 'Airbag Reset tools'),
            ('/wholesale/find-an-auto-key-tool/', 'Find an Auto Key Tool'),
            ('/wholesale/other-obd2-tools/', 'Other OBD2 Tools'),
            ('/wholesale/obd-convert-connector-obd1-to-obd2-/', 'OBD Convert Connector ( OBD1 to OBD2 )'),
            ('/wholesale/auto-hid-xenon-light/', 'Auto HID Xenon Light'),
            ('/wholesale/mp3-car-adapter/', 'MP3 Car Adapter'),
            ('/wholesale/car-led-light/', 'Car LED Light'),
            ('/wholesale/car-dvd-players/', 'Car DVD Players'),
            ('/wholesale/hot-car-accessories/', 'Hot Car Accessories'),
            ('/wholesale/automotive-electrical-testers-test-leads/', 'Automotive Electrical Testers & Test Leads'),
            ('/wholesale/car-power-inverter/', 'Car Power Inverter')]
class Robot(object):
    """Scraper for one product category of the shop served at HOST.

    Constructed from a ``(url, name)`` pair as stored in CATEGORY; the pair
    is kept on ``self.url`` and ``self.catogory`` (attribute spelling kept
    for compatibility with existing callers).
    """

    # Number of products the site shows per listing page; the original code
    # hard-coded 20 in its pagination arithmetic.
    PAGE_SIZE = 20

    # One product tile on a listing page: (href, title, image src, alt text).
    _ITEM_RE = re.compile('''<a href="([^"]+)" title="([^"]+)"><img src="([^"]+)" width="120" height="120" border="0" hspace="0" vspace="0" alt="([^"]+)" align="absmiddle" /></a>''', re.I | re.M | re.S)
    # Total number of results, shown in the page navigation bar.
    _TOTAL_RE = re.compile('''<div class="page_nav"><b>([\d]+)</b> Results, Page''', re.I | re.M | re.S)
    # Product detail page. Groups, in order: name (h1), item number,
    # USD price, weight in grams, a second "w420px" field, description HTML.
    _PRODUCT_RE = re.compile('''<div class="brief">.*<h1>(.*?)</h1>.*?<div class="pro_c_b_itemno">Item No. (.*?)</div>.*?<img src="/images/ico_currency/USD.gif" border="0" align="absmiddle" hspace="5">USD ([^<]+)</a>.*?<dd class="w420px">.*?([\d]+)g</dd>.*?<dd class="w420px">(.*?) </dd>.*?<div id="p_ab_vw_1" style="display:">(.*?)<div class="blank10px"></div>\r\n<strong>Customer Service: </strong>''', re.I | re.M | re.S)
    # 45x45 thumbnail images on a product detail page.
    _THUMB_RE = re.compile('''<img src="([^"]+)" width="45" height="45" border="0" hspace="0" vspace="0" alt="[^"]+" align="absmiddle" />''', re.I | re.M | re.S)

    def __init__(self, catogory):
        """Store the category's URL path and display name.

        :param catogory: ``(url_path, display_name)`` tuple.
        """
        super(Robot, self).__init__()
        (self.url, self.catogory) = catogory
        self.headers = {'User-agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'}

    @staticmethod
    def _last_page(total, per_page=20):
        """Return the number of the last listing page for ``total`` results.

        Uses ceiling division so a final, partially filled page is counted;
        never returns less than 1.
        """
        return max(1, -(-total // per_page))

    def decode_items(self, page_num=None):
        """Return the products listed in this category.

        :param page_num: listing page to read. ``None`` reads the first page
            and then follows every further page; a number reads only that
            page (``index_<n>.html``).
        :returns: list of ``(href, title, image_src)`` tuples.
        """
        if page_num is None:
            resp = self.__fetch(self.url)
        else:
            resp = self.__fetch(self.url + 'index_' + str(page_num) + '.html')
        html = resp.read()
        items = [(href, title, img)
                 for href, title, img, _alt in self._ITEM_RE.findall(html)]
        if page_num is None:
            totals = self._TOTAL_RE.findall(html)
            # No counter on the page means there is nothing to paginate;
            # the original raised IndexError here.
            if totals:
                last = self._last_page(int(totals[0]), self.PAGE_SIZE)
                for i in range(2, last + 1):
                    print('fetching page[ %s ]' % i)
                    items.extend(self.decode_items(i))
        return items

    def decode_product(self, url):
        """Fetch a product detail page and extract its data.

        :param url: URL path of the product page on HOST.
        :returns: ``(matches, imgs)`` where ``matches`` is the list of
            6-tuples found by the product pattern (empty if the page did not
            match) and ``imgs`` is the list of thumbnail image paths.
        """
        html = self.__fetch(url).read()
        return (self._PRODUCT_RE.findall(html), self._THUMB_RE.findall(html))

    def __fetch(self, url):
        """Issue a GET for ``url`` on HOST and return the response object."""
        conn = httplib.HTTPConnection(HOST)
        conn.request('GET', url, None, self.headers)
        return conn.getresponse()
def main():
    """Phase 1: crawl every category listing and save it to disk.

    For each entry of CATEGORY the product list returned by
    ``Robot.decode_items()`` is written, as its ``str()`` representation, to
    ``greebid/<category name>.txt``. main1() reads these files back.
    """
    # The output directory was assumed to exist; create it so a fresh
    # checkout does not fail on the first write.
    if not os.path.isdir('greebid'):
        os.makedirs('greebid')
    for item in CATEGORY:
        title = item[1]
        print('begin fetch  %s' % title)
        products = Robot(item).decode_items()
        print('end fetch  %s' % title)
        # "with" closes the file deterministically instead of relying on
        # garbage collection to flush it.
        with open('greebid/' + title + '.txt', 'w+') as listing_file:
            listing_file.write(str(products))
def main1():
    """Phase 2: download details and thumbnails for every saved product.

    Reads each ``greebid/<category>.txt`` listing written by main(), fetches
    every product page, saves its thumbnail images under
    ``greebid/<category>/`` and appends one line per product to
    ``greebid/<category>.csv``. The line holds name, item number, price,
    weight, the extra detail field and the description, followed by the
    '|'-joined local image paths.
    """
    import ast

    for item in CATEGORY:
        title = item[1]
        # SECURITY: the original called eval() on this file. The file only
        # ever holds the str() of a list of string tuples, so literal_eval
        # parses it without executing arbitrary code.
        with open('greebid/' + title + '.txt', 'r') as listing_file:
            products = ast.literal_eval(listing_file.read())
        if not os.path.isdir('greebid/' + title):
            os.mkdir('greebid/' + title)
        # One robot per category is enough; it holds no per-product state.
        robot = Robot(item)
        for p in products:
            matches, thumbs = robot.decode_product(p[0])
            if not matches:
                # A page that does not fit the pattern used to abort the
                # whole run with IndexError; skip it and carry on.
                print('skip unparsed product %s' % p[0])
                continue
            # Order follows the capture groups of the product pattern.
            (name, item_no, price, weigh, det, desc) = matches[0]
            # NOTE: the original also called desc.replace('"', '\"'), which
            # replaces a quote with itself; it is dropped as a no-op.
            data = """ %s, %s, %s, %s, %s, %s, """ % (name, item_no, price, weigh, det, desc.replace('\r\n', '').replace(',', '.'))
            paths = []
            for img in thumbs:
                url = 'http://' + HOST + img
                path = 'greebid/' + title + '/' + img.split('/')[-1]
                paths.append(path)
                remote = urllib.urlopen(url)
                try:
                    with open(path, 'wb') as image_file:
                        image_file.write(remote.read())
                finally:
                    remote.close()
            # save
            data = data + '|'.join(paths) + '\n'
            with open('greebid/' + title + '.csv', 'a+') as csv_file:
                csv_file.write(data)
# Script entry point: only phase 2 (main1) runs here. main() must have been
# run beforehand to create the greebid/<category>.txt listings main1 reads.
if __name__ == '__main__':
    main1()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
import redis, thread, httplib, socket, ConfigParser | |
import re, json | |
import sys, os, time, atexit | |
from signal import SIGTERM | |
# Host every request is sent to.
HOST = "www.amazon.com"
# URL path of a product detail page; %s is filled with the ASIN.
BUYBOX = "/Notebook-Adapter-Battery-compatible-Inspiron/dp/%s/ref=sr_1_1?ie=UTF8&s=electronics&qid=1283499480&sr=8-1"
# Earlier buy-box pattern, kept for reference (the line is cut off in this copy).
# BUYBOX_REG = '''<b class="priceLarge">\$([^<>]+)<\/b>.*?(?:Ships from and sold by <b><a href="[^"]+">(.*?)<\/a>.*?)?(?:Sold by <b><a href="[^"]+">(.*?)<\/a><\/b> and.*?)?(?:<a href="[^"]+" id="[^"]+"><strong>(.*?)<\/strong>.*?)?<\/[b|a
# Buy-box pattern for the product detail page. Capture groups:
#   1 price text after "$"
#   2 seller id from a "Ships from and sold by" link (optional)
#   3 seller id from a "Sold by ... and" link (optional)
#   4 text of the <strong> link (optional); Robot.analyze compares it with
#     'Fulfilled by Amazon'
BUYBOX_REG = '''<b class="priceLarge">\$([^<>]+)<\/b>.*?<div class="buying" style="padding-bottom: 0.75em;">.*?(?:Ships from and sold by <b><a href=".*seller\=([\d\w]+)">.*<\/a>.*?)?(?:Sold by <b><a href=".*seller\=([\d\w]+)">.*<\/a><\/b> and.*?)?(?:<a href="[^"]+" id="[^"]+"><strong>(.*?)<\/strong>.*?)?<\/[b|a]?>\.'''
# URL path of the "new" offer listing page; %s is filled with the ASIN.
LISTING = "/gp/offer-listing/%s/sr=/qid=/ref=olp_tab_new?ie=UTF8&coliid=&me=&qid=&sr=&seller=&colid=&condition=new"
# Earlier offer-listing pattern, kept for reference.
# LISTING_REG = '''<tbody class="result">.*?(?:<span class="price">\$([\d\.]+)<\/span>.*?)?(?:<span class="price_shipping">\+ \$([^<>]*)<\/span>.*?)?(?:<a href="http\:\/\/www\.amazon\.com\/shops\/([A-Z0-9]+)\/[^"]+">.*?)?(?:<a href="\/gp\/aag\/main\/ref.*?seller=([A-Z0-9]+)">.*?)?(?:<img src="([^"]+)" width="120" alt="([\w\s\-\,\.\&]+)?".*?)?(?:<b>([\w\s\-\,\.\&]*)<\/b><\/a>.*?)?(?:(Fulfillment) by Amazon.*?)?<\/tbody>'''
# One offer on the listing page. Capture groups (all but the first optional):
#   1 offer price          2 shipping price
#   3 shop id (/shops/ URL)    4 seller id (at-a-glance URL)
#   5 seller image src     6 seller image alt text
#   7 seller name in <b>   8 the word "Fulfillment" when "Fulfillment by Amazon" appears
LISTING_REG = '''<span class='a-size-large a-color-price olpOfferPrice a-text-bold'>.*?\$([\d\.]+).*?<\/span>.*?(?:<span class="olpShippingPrice">\$([\d\.]+)<\/span>.*?)?(?:<a href="http\:\/\/www\.amazon\.com\/shops\/([A-Z0-9]+)\/[^"]+">.*?)?(?:<a href="\/gp\/help\/seller\/at-a-glance\.html.*?seller=([A-Z0-9]+)">.*?)?(?:<img src="([^"]+)" width="120" alt="([\w\s\-\,\.]+)?".*?)?(?:<b>([\w\s\-\,\.]*)<\/b><\/a>.*?)?(?:(Fulfillment) by Amazon.*?)?<p class='a-spacing-none a-text-center olpSignIn'>'''
class Robot(object):
    """Fetches the buy-box and offer-listing pages for an ASIN and publishes
    the parsed offers on the ``monitor_center`` redis channel.
    """

    def __init__(self, r, bind_ip, bind_port):
        """
        :param r: redis client used by postback(); must provide ``publish``.
        :param bind_ip: local address outgoing HTTP connections bind to.
        :param bind_port: local port outgoing HTTP connections bind to.
        """
        super(Robot, self).__init__()
        self.r, self.ip, self.port = r, bind_ip, bind_port
        self.headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}

    def analyze(self, asin, fid, nid):
        """Fetch and parse both pages for ``asin`` and post the result back.

        Every outcome, success or failure, is reported through postback();
        nothing is returned.

        :param asin: product identifier substituted into BUYBOX and LISTING.
        :param fid: request id echoed back as ``fid``.
        :param nid: node id echoed back as ``client``.
        """
        status, msg, content = 200, 'OK', []

        buybox_html = self._fetch_page(BUYBOX % asin, fid, nid)
        if buybox_html is None:
            return
        listing_html = self._fetch_page(LISTING % asin, fid, nid)
        if listing_html is None:
            return

        # Parse the offer listing. Number conversion is inside the try so a
        # malformed price is reported as code 51 instead of killing the
        # worker thread silently.
        try:
            listing_rtn = re.compile(LISTING_REG, re.I | re.M | re.S).findall(listing_html)
            for rank, item in enumerate(listing_rtn, 1):
                content.append(self._build_offer(rank, item))
        except Exception as e:
            self.postback(fid, nid, 51, str(e), content)
            return

        # Parse the buy box and flag the matching offer, if any.
        try:
            buybox_rtn = re.compile(BUYBOX_REG, re.I | re.M | re.S).findall(buybox_html)
            if buybox_rtn:
                self._flag_buybox(content, buybox_rtn[0])
            else:
                status, msg = 52, 'analyze buybox error'
        except Exception as e:
            self.postback(fid, nid, 52, str(e), content)
            return

        postbacked = self.postback(fid, nid, status, msg, content)
        print('%s %s %s %s %s %s' % (asin, fid, nid, postbacked, status, msg))

    def _fetch_page(self, url, fid, nid):
        """Return the body of ``url``, or None after reporting the failure.

        Connection or read errors are posted back with code 50; a non-200
        response is posted back with its own HTTP status and reason.
        """
        try:
            resp = self.__fetch(url)
            body = resp.read() if resp.status == 200 else None
        except Exception as e:
            self.postback(fid, nid, 50, str(e), [])
            return None
        if body is None:
            self.postback(fid, nid, resp.status, resp.reason, [])
        return body

    @staticmethod
    def _to_cents(amount):
        """Convert a price string such as '19.99' to integer cents.

        An empty string yields 0. Thousands separators are ignored. Rounding
        avoids float artefacts such as 100 * 19.99 == 1998.9999999999998.
        """
        amount = amount.replace(',', '').strip()
        if not amount:
            return 0
        return int(round(float(amount) * 100))

    @staticmethod
    def _build_offer(rank, item):
        """Turn one LISTING_REG match tuple into an offer dict."""
        return {
            'rank': rank,
            # The original read item[1] (the shipping price) for whole-number
            # sell prices; the sell price is always item[0].
            'sell_price': Robot._to_cents(item[0]),
            'shipping_price': Robot._to_cents(item[1]),
            # Only one of the two id groups is expected to match per offer.
            'sid': item[2] + item[3],
            'avatar': item[4],
            'seller': item[5] + item[6],
            'if_fba': item[7] == 'Fulfillment',
            'if_buybox': False,
        }

    @staticmethod
    def _flag_buybox(content, buybox):
        """Set ``if_buybox`` on every offer matching the buy-box match tuple.

        An offer matches when price, seller id and FBA flag all agree.
        """
        price = Robot._to_cents(buybox[0])
        sid = buybox[1] or buybox[2]
        if_fba = buybox[3] == 'Fulfilled by Amazon'
        for current in content:
            if current['sell_price'] == price and current['sid'] == sid and current['if_fba'] == if_fba:
                current['if_buybox'] = True

    def postback(self, fid, nid, status, msg, content):
        """Publish a result on the ``monitor_center`` channel.

        Status codes used by this class:
            50  fetching error
            51  parsing error (listing)
            52  parsing error (buybox)
        Any other value is the HTTP status of a failed page fetch
        (e.g. 404, 503) or 200 for success.
        The original notes also list "201 Buybox Analyze Failed"; this class
        does not emit it.

        :returns: whatever ``self.r.publish`` returns.
        """
        return self.r.publish('monitor_center', json.dumps({'fid': fid, 'status': status, 'msg': msg, 'result': content, 'client': nid}))

    def __fetch(self, url):
        """GET ``url`` from HOST, bound to (self.ip, self.port); return the response."""
        conn = httplib.HTTPConnection(HOST, source_address=(self.ip, self.port))
        conn.request('GET', url, None, self.headers)
        return conn.getresponse()
class Daemon(object):
    """
    A generic daemon class.
    Usage: subclass the Daemon class and override the run() method
    """

    def __init__(self, pidfile, stdin='/dev/null', stdout='/dev/null', stderr='/dev/null'):
        """
        :param pidfile: path of the file holding the daemon's process id.
        :param stdin: file the daemon's standard input is redirected from.
        :param stdout: file the daemon's standard output is appended to.
        :param stderr: file the daemon's standard error is appended to.
        """
        self.stdin = stdin
        self.stdout = stdout
        self.stderr = stderr
        self.pidfile = pidfile

    def _read_pid(self):
        """Return the pid stored in the pidfile, or None.

        None is returned when the file cannot be read or does not contain an
        integer (the original only handled the unreadable case and crashed
        on an empty or corrupt pidfile).
        """
        try:
            with open(self.pidfile, 'r') as pf:
                return int(pf.read().strip())
        except (IOError, ValueError):
            return None

    def daemonize(self):
        """
        do the UNIX double-fork magic, see Stevens' "Advanced
        Programming in the UNIX Environment" for details (ISBN 0201563177)
        http://www.erlenstar.demon.co.uk/unix/faq_2.html#SEC16
        """
        try:
            pid = os.fork()
            if pid > 0:
                # exit first parent
                sys.exit(0)
        except OSError as e:
            sys.stderr.write("fork #1 failed: %d (%s)\n" % (e.errno, e.strerror))
            sys.exit(1)
        # decouple from parent environment
        os.chdir("/")
        os.setsid()
        os.umask(0)
        # do second fork
        try:
            pid = os.fork()
            if pid > 0:
                # exit from second parent
                sys.exit(0)
        except OSError as e:
            sys.stderr.write("fork #2 failed: %d (%s)\n" % (e.errno, e.strerror))
            sys.exit(1)
        # redirect standard file descriptors
        sys.stdout.flush()
        sys.stderr.flush()
        si = open(self.stdin, 'r')
        so = open(self.stdout, 'a+')
        se = open(self.stderr, 'a+', 0)
        os.dup2(si.fileno(), sys.stdin.fileno())
        os.dup2(so.fileno(), sys.stdout.fileno())
        os.dup2(se.fileno(), sys.stderr.fileno())
        # write pidfile
        atexit.register(self.delpid)
        pid = str(os.getpid())
        with open(self.pidfile, 'w+') as pf:
            pf.write("%s\n" % pid)

    def delpid(self):
        """Remove the pidfile; registered to run at interpreter exit."""
        # The file may already have been removed by stop().
        if os.path.exists(self.pidfile):
            os.remove(self.pidfile)

    def start(self):
        """
        Start the daemon
        """
        # Check for a pidfile to see if the daemon already runs
        if self._read_pid():
            message = "pidfile %s already exist. Daemon already running?\n"
            sys.stderr.write(message % self.pidfile)
            sys.exit(1)
        # Start the daemon
        self.daemonize()
        self.run()

    def stop(self):
        """
        Stop the daemon
        """
        import errno

        # Get the pid from the pidfile
        pid = self._read_pid()
        if not pid:
            message = "pidfile %s does not exist. Daemon not running?\n"
            sys.stderr.write(message % self.pidfile)
            return  # not an error in a restart
        # Try killing the daemon process: keep signalling until the kernel
        # reports that the process is gone.
        try:
            while 1:
                os.kill(pid, SIGTERM)
                time.sleep(0.1)
        except OSError as err:
            # Compare the error code rather than searching the message text
            # for "No such process".
            if err.errno == errno.ESRCH:
                if os.path.exists(self.pidfile):
                    os.remove(self.pidfile)
            else:
                print(str(err))
                sys.exit(1)

    def restart(self):
        """
        Restart the daemon
        """
        self.stop()
        self.start()

    def run(self):
        """
        You should override this method when you subclass Daemon. It will be called after the process has been
        daemonized by start() or restart().
        """
class MyDaemon(Daemon):
    """Daemon that listens on a redis channel and analyzes each requested ASIN."""

    def set_config_file(self, config_file):
        """Remember the path of the INI file that run() reads."""
        self.config_file = config_file

    def run(self):
        """Subscribe to ``monitor_<site_id>`` and dispatch incoming jobs.

        Reads ``[redis] host`` and ``[node] site_id`` / ``bind_ip`` from the
        config file. Each message payload is expected to be ``asin,fid``;
        every job is handed to Robot.analyze on a new thread. Never returns.
        """
        config = ConfigParser.ConfigParser()
        config.read(self.config_file)
        host = config.get('redis', 'host')
        nid = config.get('node', 'site_id')
        ip = config.get('node', 'bind_ip')
        pool = redis.ConnectionPool(host=host)
        r = redis.Redis(connection_pool=pool)
        pubsub = r.pubsub()
        pubsub.subscribe('monitor_%s' % nid)
        # Port 0 is passed as the bind port for outgoing connections.
        robot = Robot(r, ip, 0)
        while True:
            for m in pubsub.listen():
                if m['type'] != 'message':
                    continue
                fields = m['data'].split(',')
                if len(fields) != 2:
                    # A malformed payload used to raise ValueError and stop
                    # the whole listener; report it and keep serving.
                    sys.stderr.write("ignoring malformed job: %r\n" % (m['data'],))
                    continue
                asin, fid = fields
                thread.start_new_thread(robot.analyze, (asin, fid, nid))
def main():
    """Command-line entry point: ``<script> config_file start|stop|restart``.

    Exits with status 0 after a successful command and 2 on a usage error
    or an unknown command.
    """
    # Validate the argument count before touching sys.argv[1]; the original
    # indexed first and raised IndexError when run without arguments.
    if len(sys.argv) != 3:
        print("usage: %s config_file start|stop|restart" % sys.argv[0])
        sys.exit(2)

    path = os.path.realpath(sys.argv[1])
    command = sys.argv[2]
    # One pidfile per config file, so several nodes can run side by side.
    daemon = MyDaemon('/tmp/daemon-' + path.replace('/', '_') + '.pid')
    daemon.set_config_file(path)
    if command == 'start':
        daemon.start()
    elif command == 'stop':
        daemon.stop()
    elif command == 'restart':
        daemon.restart()
    else:
        print("Unknown command")
        sys.exit(2)
    sys.exit(0)
# Script entry point: hand the command line to main().
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment