Skip to content

Instantly share code, notes, and snippets.

@arook
Last active October 13, 2015 17:18
Show Gist options
  • Save arook/4230062 to your computer and use it in GitHub Desktop.
Save arook/4230062 to your computer and use it in GitHub Desktop.
Monitor Client
[redis]
host = ''
[node]
site_id = 1
bind_ip = ''
import httplib, re, os, urllib
# Host name of the shop being scraped; Robot opens its HTTP connections to it
# and main1() prefixes it to image paths.
HOST = 'www.greebid.com'
# Categories to crawl, as (URL path on HOST, display name) pairs.  The display
# name is also used to build the 'greebid/<name>.txt' / '.csv' file names and
# the per-category image directory.
CATEGORY = [('/wholesale/original-launch-x431-auto-professional-scanner/', 'Original Launch X431 Auto Professional Scanner'),
('/wholesale/wholesale-obd-tool/', 'Wholesale OBD Tool'),
('/wholesale/auto-diagnostic-scanner/', 'Auto Diagnostic Scanner'),
('/wholesale/vag-diagnostic-tool/', 'VAG Diagnostic Tool'),
('/wholesale/heavy-duty-truck-diagnosis/', 'Heavy Duty Truck Diagnosis'),
('/wholesale/car-key-programmer/', 'Car Key programmer'),
('/wholesale/obd-code-reader/', 'OBD Code Reader'),
('/wholesale/ecu-programmer-tool/', 'ECU Programmer tool'),
('/wholesale/auto-diagnostic-software/', 'Auto Diagnostic software'),
('/wholesale/odometer-correction-tools/', 'Odometer Correction Tools'),
('/wholesale/airbag-reset-tools/', 'Airbag Reset tools'),
('/wholesale/find-an-auto-key-tool/', 'Find an Auto Key Tool'),
('/wholesale/other-obd2-tools/', 'Other OBD2 Tools'),
('/wholesale/obd-convert-connector-obd1-to-obd2-/', 'OBD Convert Connector ( OBD1 to OBD2 )'),
('/wholesale/auto-hid-xenon-light/', 'Auto HID Xenon Light'),
('/wholesale/mp3-car-adapter/', 'MP3 Car Adapter'),
('/wholesale/car-led-light/', 'Car LED Light'),
('/wholesale/car-dvd-players/', 'Car DVD Players'),
('/wholesale/hot-car-accessories/', 'Hot Car Accessories'),
('/wholesale/automotive-electrical-testers-test-leads/', 'Automotive Electrical Testers & Test Leads'),
('/wholesale/car-power-inverter/', 'Car Power Inverter')]
class Robot(object):
    """Scraper for one category listing on HOST and for its product pages."""

    # Products per listing page.  The original code hard-coded 20 in its
    # pagination arithmetic (``pn > 20`` / ``pn/20``).
    PAGE_SIZE = 20

    def __init__(self, catogory):
        """Remember which category to crawl.

        catogory -- a ``(url_path, display_name)`` pair such as an entry of
                    CATEGORY.  The misspelled parameter and attribute names
                    are kept so existing callers keep working.
        """
        super(Robot, self).__init__()
        (self.url, self.catogory) = catogory
        self.headers = {'User-agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'}

    def decode_items(self, page_num = None):
        """Return the products listed in this category.

        page_num -- None fetches the category's first page and then every
                    following ``index_<n>.html`` page; a number fetches only
                    that page.

        Returns a list of ``(href, title, image_src)`` tuples taken from the
        120x120 thumbnail links on the page(s).
        """
        if page_num is None:
            resp = self.__fetch(self.url)
        else:
            resp = self.__fetch(self.url + 'index_' + str(page_num) + '.html')
        html = resp.read()
        matches = re.compile('''<a href="([^"]+)" title="([^"]+)"><img src="([^"]+)" width="120" height="120" border="0" hspace="0" vspace="0" alt="([^"]+)" align="absmiddle" /></a>''', re.I | re.M | re.S).findall(html)
        # The fourth group (alt text) is matched but not kept.
        items = [(m[0], m[1], m[2]) for m in matches]
        if page_num is None:
            pages = re.compile('''<div class="page_nav"><b>([\d]+)</b> Results, Page''', re.I | re.M | re.S).findall(html)
            # No page-nav marker means there is nothing to paginate.
            total = int(pages[0]) if pages else 0
            # Round up so a final, partially filled page is fetched too.  The
            # previous ``int(pn/20)`` rounded down and skipped it (21-39
            # results never fetched page 2).
            last_page = (total + self.PAGE_SIZE - 1) // self.PAGE_SIZE
            for i in range(2, last_page + 1):
                print('fetching page[ %s ]' % i)
                items.extend(self.decode_items(i))
        return items

    def decode_product(self, url):
        """Fetch a product page and extract its details.

        Returns ``(matches, imgs)``: ``matches`` is the findall() result of
        the product regex (six groups per match: <h1> text, item number, USD
        price, weight in grams, the second "w420px" field, and the
        description HTML); ``imgs`` lists the ``src`` of every 45x45
        thumbnail image.  ``matches`` is empty when the page layout does not
        fit the regex.
        """
        resp = self.__fetch(url)
        html = resp.read()
        matches = re.compile('''<div class="brief">.*<h1>(.*?)</h1>.*?<div class="pro_c_b_itemno">Item No. (.*?)</div>.*?<img src="/images/ico_currency/USD.gif" border="0" align="absmiddle" hspace="5">USD&nbsp;([^<]+)</a>.*?<dd class="w420px">.*?([\d]+)g</dd>.*?<dd class="w420px">(.*?) </dd>.*?<div id="p_ab_vw_1" style="display:">(.*?)<div class="blank10px"></div>\r\n<strong>Customer Service: </strong>''', re.I | re.M | re.S).findall(html)
        imgs = re.compile('''<img src="([^"]+)" width="45" height="45" border="0" hspace="0" vspace="0" alt="[^"]+" align="absmiddle" />''', re.I | re.M | re.S).findall(html)
        return (matches, imgs)

    def __fetch(self, url):
        """GET ``url`` (a path on HOST) with the browser-like headers and return the HTTP response."""
        conn = httplib.HTTPConnection(HOST)
        conn.request('GET', url, None, self.headers)
        return conn.getresponse()
def main():
    """Crawl every category in CATEGORY and save its product list.

    For each category the list returned by Robot.decode_items() is written,
    as its ``str()`` representation, to ``greebid/<display name>.txt``
    (the file main1() later reads back).
    """
    # Create the output directory up front so the first write cannot fail
    # just because it is missing.
    if not os.path.isdir('greebid'):
        os.makedirs('greebid')
    for item in CATEGORY:
        print('begin fetch  %s' % item[1])
        robot = Robot(item)
        products = robot.decode_items()
        print('end fetch  %s' % item[1])
        # ``with`` closes the file deterministically instead of relying on
        # garbage collection.
        with open('greebid/' + item[1] + '.txt', 'w') as out:
            out.write(str(products))
def main1():
    """fetching item 1

    Second pass: for every category, read the product list saved by main()
    from ``greebid/<name>.txt``, fetch each product page, download its
    thumbnail images into ``greebid/<name>/`` and append one line per product
    to ``greebid/<name>.csv``.
    """
    # Local import: the file's top-level import line does not include ast.
    import ast

    for item in CATEGORY:
        category_name = item[1]
        with open('greebid/' + category_name + '.txt', 'r') as listing_file:
            # The file holds the str() of a list of string tuples written by
            # main().  literal_eval parses exactly that without executing
            # arbitrary code, which the previous eval() would have done.
            products = ast.literal_eval(listing_file.read())
        image_dir = 'greebid/' + category_name
        if not os.path.isdir(image_dir):
            os.mkdir(image_dir)
        # One Robot per category is enough; it keeps no per-product state.
        robot = Robot(item)
        for p in products:
            matches, imgs = robot.decode_product(p[0])
            if not matches:
                # The page layout did not fit the product regex; skip the
                # product instead of dying with an IndexError.
                print('skip %s: product details not found' % p[0])
                continue
            # Group order of the product regex: <h1> name, item number,
            # price, weight, detail field, description.
            (name, item_no, price, weight, detail, desc) = matches[0]
            # NOTE(review): only the description has its commas and line
            # breaks neutralised; the other fields go out verbatim, as
            # before.  The old ``.replace('"', '\"')`` was a no-op ('\"' is
            # just '"') and is dropped.
            data = """ %s, %s, %s, %s, %s, %s, """ % (name, item_no, price, weight, detail, desc.replace('\r\n', '').replace(',', '.'))
            paths = []
            for img in imgs:
                url = 'http://' + HOST + img
                path = image_dir + '/' + img.split('/')[-1]
                paths.append(path)
                remote = urllib.urlopen(url)
                try:
                    payload = remote.read()
                finally:
                    remote.close()
                with open(path, 'wb') as image_file:
                    image_file.write(payload)
            # save
            with open('greebid/' + category_name + '.csv', 'a+') as csv_file:
                csv_file.write(data + '|'.join(paths) + '\n')
# Script entry point: run main1() when this file is executed directly.
if __name__ == '__main__':
    main1()
#!/usr/bin/env python
import redis, thread, httplib, socket, ConfigParser
import re, json
import sys, os, time, atexit
from signal import SIGTERM
# Host every request is sent to.
HOST = "www.amazon.com"
# Path template of the product ("buy box") page; %s is filled with the ASIN.
BUYBOX = "/Notebook-Adapter-Battery-compatible-Inspiron/dp/%s/ref=sr_1_1?ie=UTF8&s=electronics&qid=1283499480&sr=8-1"
# Earlier version of BUYBOX_REG, kept for reference (the line is truncated):
# BUYBOX_REG = '''<b class="priceLarge">\$([^<>]+)<\/b>.*?(?:Ships from and sold by <b><a href="[^"]+">(.*?)<\/a>.*?)?(?:Sold by <b><a href="[^"]+">(.*?)<\/a><\/b> and.*?)?(?:<a href="[^"]+" id="[^"]+"><strong>(.*?)<\/strong>.*?)?<\/[b|a
# Buy-box pattern.  Groups: 1 = price text after "$", 2 = seller id from a
# "Ships from and sold by" link, 3 = seller id from a "Sold by" link,
# 4 = the <strong> text (compared against 'Fulfilled by Amazon' in analyze()).
BUYBOX_REG = '''<b class="priceLarge">\$([^<>]+)<\/b>.*?<div class="buying" style="padding-bottom: 0.75em;">.*?(?:Ships from and sold by <b><a href=".*seller\=([\d\w]+)">.*<\/a>.*?)?(?:Sold by <b><a href=".*seller\=([\d\w]+)">.*<\/a><\/b> and.*?)?(?:<a href="[^"]+" id="[^"]+"><strong>(.*?)<\/strong>.*?)?<\/[b|a]?>\.'''
# Path template of the new-condition offer listing page; %s is the ASIN.
LISTING = "/gp/offer-listing/%s/sr=/qid=/ref=olp_tab_new?ie=UTF8&coliid=&me=&qid=&sr=&seller=&colid=&condition=new"
# Earlier version of LISTING_REG, kept for reference:
# LISTING_REG = '''<tbody class="result">.*?(?:<span class="price">\$([\d\.]+)<\/span>.*?)?(?:<span class="price_shipping">\+ \$([^<>]*)<\/span>.*?)?(?:<a href="http\:\/\/www\.amazon\.com\/shops\/([A-Z0-9]+)\/[^"]+">.*?)?(?:<a href="\/gp\/aag\/main\/ref.*?seller=([A-Z0-9]+)">.*?)?(?:<img src="([^"]+)" width="120" alt="([\w\s\-\,\.\&]+)?".*?)?(?:<b>([\w\s\-\,\.\&]*)<\/b><\/a>.*?)?(?:(Fulfillment) by Amazon.*?)?<\/tbody>'''
# Offer pattern, one match per offer.  Groups: 1 = offer price, 2 = shipping
# price, 3 = seller id from a /shops/ link, 4 = seller id from an
# at-a-glance link, 5 = seller image src, 6 = seller image alt text,
# 7 = seller name in <b>, 8 = "Fulfillment" when "Fulfillment by Amazon" appears.
LISTING_REG = '''<span class='a-size-large a-color-price olpOfferPrice a-text-bold'>.*?\$([\d\.]+).*?<\/span>.*?(?:<span class="olpShippingPrice">\$([\d\.]+)<\/span>.*?)?(?:<a href="http\:\/\/www\.amazon\.com\/shops\/([A-Z0-9]+)\/[^"]+">.*?)?(?:<a href="\/gp\/help\/seller\/at-a-glance\.html.*?seller=([A-Z0-9]+)">.*?)?(?:<img src="([^"]+)" width="120" alt="([\w\s\-\,\.]+)?".*?)?(?:<b>([\w\s\-\,\.]*)<\/b><\/a>.*?)?(?:(Fulfillment) by Amazon.*?)?<p class='a-spacing-none a-text-center olpSignIn'>'''
class Robot(object):
    """Fetch an ASIN's buy-box and offer-listing pages from HOST, parse the
    offers and publish the result on the 'monitor_center' channel."""

    def __init__(self, r, bind_ip, bind_port):
        """
        r         -- object whose ``publish(channel, message)`` method is
                     used to report results
        bind_ip   -- local address outgoing HTTP connections are bound to
        bind_port -- local port outgoing HTTP connections are bound to
        """
        super(Robot, self).__init__()
        self.r, self.ip, self.port = r, bind_ip, bind_port
        self.headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}

    def analyze(self, asin, fid, nid):
        """Analyze one ASIN and post the outcome back.

        asin -- product id substituted into the BUYBOX and LISTING paths
        fid  -- request id, echoed back in the postback
        nid  -- client/node id, echoed back in the postback

        Every outcome, success or failure, is reported through postback();
        nothing is returned.  See postback() for the status codes.
        """
        status, msg, content = 200, 'OK', []

        # fetching buybox page
        buybox_html = self._fetch_html(BUYBOX % asin, fid, nid)
        if buybox_html is None:
            return
        # fetching listing page
        listing_html = self._fetch_html(LISTING % asin, fid, nid)
        if listing_html is None:
            return

        # parse HTML Code
        # analyze() runs as a thread entry point, so any parsing failure is
        # reported through postback() rather than left to kill the thread.
        try:
            listing_rtn = re.compile(LISTING_REG, re.I | re.M | re.S).findall(listing_html)
            content = [self._offer(rank, item) for rank, item in enumerate(listing_rtn, 1)]
        except Exception as e:
            # Previously this passed an undefined ``r`` and six arguments,
            # so the handler itself raised instead of reporting.
            self.postback(fid, nid, 51, str(e), [])
            return

        try:
            buybox_rtn = re.compile(BUYBOX_REG, re.I | re.M | re.S).findall(buybox_html)
        except Exception as e:
            self.postback(fid, nid, 52, str(e), content)
            return

        if len(buybox_rtn) == 0:
            status, msg = 52, 'analyze buybox error'
        else:
            price, ships_sid, sold_sid, fulfilment = buybox_rtn[0]
            sid = ships_sid or sold_sid
            if_fba = fulfilment == 'Fulfilled by Amazon'
            try:
                buybox_price = 100 * float(price)
            except ValueError:
                # The price group accepts any text; unparsable text is a
                # buy-box parsing failure, not a crash.
                status, msg = 52, 'analyze buybox error'
            else:
                # Flag the listing offer(s) matching the buy box on price,
                # seller id and fulfilment.
                for current in content:
                    if current['sell_price'] == buybox_price and current['sid'] == sid and current['if_fba'] == if_fba:
                        current['if_buybox'] = True

        postbacked = self.postback(fid, nid, status, msg, content)
        print('%s %s %s %s %s %s' % (asin, fid, nid, postbacked, status, msg))

    def postback(self, fid, nid, status, msg, content):
        """Publish a JSON result message on 'monitor_center' and return
        whatever ``publish`` returns.

        ERROR Code
            50 Programming Level Fetching Error
            51 Programming Level Parsing Error[Listing]
            52 Programming Level Parsing Error[Buybox]
        Normal HTTP Code
            404 NotFound
            503
            201 Buybox Analyze Failed
        """
        return self.r.publish('monitor_center', json.dumps({'fid': fid, 'status': status, 'msg': msg, 'result': content, 'client': nid}))

    def _fetch_html(self, path, fid, nid):
        """Return the body of ``path``, or None after posting back the failure
        (status 50 for an exception, otherwise the non-200 HTTP status)."""
        try:
            resp = self.__fetch(path)
            if resp.status != 200:
                self.postback(fid, nid, resp.status, resp.reason, [])
                return None
            # Reading inside the try block so a failure while receiving the
            # body is reported as well.
            return resp.read()
        except Exception as e:
            self.postback(fid, nid, 50, str(e), [])
            return None

    def _offer(self, rank, item):
        """Build the result dict for one LISTING_REG match (eight groups)."""
        return {
            'rank': rank,
            # Was computed from item[1] (the shipping price) in the integer
            # branch: a copy-paste slip.
            'sell_price': self._to_cents(item[0]),
            'shipping_price': self._to_cents(item[1]),
            # At most one of the two seller-id groups is expected to be
            # non-empty; same for the two seller-name groups.
            'sid': item[2] + item[3],
            'avatar': item[4],
            'seller': item[5] + item[6],
            'if_fba': item[7] == 'Fulfillment',
            'if_buybox' : False
        }

    @staticmethod
    def _to_cents(text):
        """Convert a captured price string to cents; an empty string is 0."""
        if '.' in text:
            return 100 * float(text)
        if text != '':
            return 100 * int(text)
        return 0

    def __fetch(self, url):
        """GET ``url`` from HOST over a connection bound to (self.ip, self.port) and return the response."""
        conn = httplib.HTTPConnection(HOST, source_address = (self.ip, self.port))
        conn.request('GET', url, None, self.headers)
        return conn.getresponse()
class Daemon:
    """
    A generic daemon class.

    Usage: subclass the Daemon class and override the run() method
    """

    def __init__(self, pidfile, stdin='/dev/null', stdout='/dev/null', stderr='/dev/null'):
        """
        pidfile -- path of the file holding the daemon's process id
        stdin, stdout, stderr -- files the standard streams are redirected
                                 to once daemonized
        """
        self.stdin = stdin
        self.stdout = stdout
        self.stderr = stderr
        self.pidfile = pidfile

    def daemonize(self):
        """
        do the UNIX double-fork magic, see Stevens' "Advanced
        Programming in the UNIX Environment" for details (ISBN 0201563177)
        http://www.erlenstar.demon.co.uk/unix/faq_2.html#SEC16
        """
        try:
            pid = os.fork()
            if pid > 0:
                # exit first parent
                sys.exit(0)
        except OSError as e:
            sys.stderr.write("fork #1 failed: %d (%s)\n" % (e.errno, e.strerror))
            sys.exit(1)

        # decouple from parent environment
        os.chdir("/")
        os.setsid()
        os.umask(0)

        # do second fork
        try:
            pid = os.fork()
            if pid > 0:
                # exit from second parent
                sys.exit(0)
        except OSError as e:
            sys.stderr.write("fork #2 failed: %d (%s)\n" % (e.errno, e.strerror))
            sys.exit(1)

        # redirect standard file descriptors
        sys.stdout.flush()
        sys.stderr.flush()
        si = open(self.stdin, 'r')
        so = open(self.stdout, 'a+')
        se = open(self.stderr, 'a+', 0)
        os.dup2(si.fileno(), sys.stdin.fileno())
        os.dup2(so.fileno(), sys.stdout.fileno())
        os.dup2(se.fileno(), sys.stderr.fileno())

        # write pidfile
        atexit.register(self.delpid)
        pid = str(os.getpid())
        with open(self.pidfile, 'w+') as pf:
            pf.write("%s\n" % pid)

    def delpid(self):
        """Remove the pidfile (registered to run at interpreter exit)."""
        # The file may already be gone; that must not raise during exit.
        if os.path.exists(self.pidfile):
            os.remove(self.pidfile)

    def _read_pid(self):
        """Return the pid stored in the pidfile, or None when the file is
        missing, unreadable or does not contain a number."""
        try:
            with open(self.pidfile, 'r') as pf:
                return int(pf.read().strip())
        except (IOError, ValueError):
            return None

    def start(self):
        """
        Start the daemon
        """
        # Check for a pidfile to see if the daemon already runs
        pid = self._read_pid()
        if pid:
            message = "pidfile %s already exist. Daemon already running?\n"
            sys.stderr.write(message % self.pidfile)
            sys.exit(1)

        # Start the daemon
        self.daemonize()
        self.run()

    def stop(self):
        """
        Stop the daemon
        """
        # Local import: errno is not among the file's top-level imports.
        import errno

        # Get the pid from the pidfile
        pid = self._read_pid()
        if not pid:
            message = "pidfile %s does not exist. Daemon not running?\n"
            sys.stderr.write(message % self.pidfile)
            return # not an error in a restart

        # Try killing the daemon process: keep sending SIGTERM until the
        # kill call itself fails because the process is gone.
        try:
            while 1:
                os.kill(pid, SIGTERM)
                time.sleep(0.1)
        except OSError as err:
            # Compare the error code rather than the message text, which
            # depends on the platform's wording.
            if err.errno == errno.ESRCH:
                if os.path.exists(self.pidfile):
                    os.remove(self.pidfile)
            else:
                print(str(err))
                sys.exit(1)

    def restart(self):
        """
        Restart the daemon
        """
        self.stop()
        self.start()

    def run(self):
        """
        You should override this method when you subclass Daemon. It will be called after the process has been
        daemonized by start() or restart().
        """
class MyDaemon(Daemon):
    """Daemon that listens on a redis channel and starts one analysis thread
    per received request."""

    def set_config_file(self, config_file):
        """Set the path of the INI file read by run()."""
        self.config_file = config_file

    def run(self):
        """Subscribe to 'monitor_<site_id>' and dispatch incoming requests.

        Reads ``host`` from the [redis] section and ``site_id`` / ``bind_ip``
        from the [node] section of the config file.  Each message of type
        'message' is expected to carry "<asin>,<fid>" and is handed to
        Robot.analyze in a new thread.  Never returns.
        """
        config = ConfigParser.ConfigParser()
        config.read(self.config_file)
        host = config.get('redis', 'host')
        nid = config.get('node', 'site_id')
        ip = config.get('node', 'bind_ip')

        pool = redis.ConnectionPool(host = host)
        r = redis.Redis(connection_pool = pool)
        pubsub = r.pubsub()
        pubsub.subscribe('monitor_%s' % nid)

        # Port 0 is passed as the bind port for outgoing connections.
        robot = Robot(r, ip, 0)
        while True:
            for m in pubsub.listen():
                if m['type'] != 'message':
                    continue
                try:
                    asin, fid = m['data'].split(',')
                except ValueError:
                    # A payload without exactly one comma used to raise here
                    # and end the whole listen loop; skip it instead.
                    sys.stderr.write("ignoring malformed message: %r\n" % (m['data'],))
                    continue
                thread.start_new_thread(robot.analyze, (asin, fid, nid))
def main():
    """Command-line entry point: ``<script> config_file start|stop|restart``.

    Builds a MyDaemon whose pidfile name is derived from the config file's
    real path, then runs the requested command.  Exits with status 2 on a
    wrong argument count or an unknown command, 0 otherwise.
    """
    # Validate the argument count before touching sys.argv[1]; previously a
    # bare invocation raised IndexError instead of printing the usage line.
    if len(sys.argv) != 3:
        print("usage: %s config_file start|stop|restart" % sys.argv[0])
        sys.exit(2)

    path = os.path.realpath(sys.argv[1])
    command = sys.argv[2]
    daemon = MyDaemon('/tmp/daemon-' + path.replace('/', '_') + '.pid')
    daemon.set_config_file(path)
    if command == 'start':
        daemon.start()
    elif command == 'stop':
        daemon.stop()
    elif command == 'restart':
        daemon.restart()
    else:
        print("Unknown command")
        sys.exit(2)
    sys.exit(0)
# Script entry point: run the command-line handler when executed directly.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment