Skip to content

Instantly share code, notes, and snippets.

@Wizmann
Created March 2, 2013 15:26
Show Gist options
  • Save Wizmann/5071535 to your computer and use it in GitHub Desktop.
Save Wizmann/5071535 to your computer and use it in GitHub Desktop.
Python XML Parser with gevent
# coding=utf-8
import sys
import xml.etree.cElementTree as cElementTree
from pyquery import PyQuery
import time
import gevent
from gevent.queue import Queue, Empty
# Python 2 only: the site module removes sys.setdefaultencoding at startup,
# so sys is reloaded to get the function back before switching the implicit
# str/unicode codec to UTF-8. The order of these two statements matters.
reload(sys)
sys.setdefaultencoding('utf-8')
# Number of worker greenlets spawned by the __main__ block.
WORKERS = 19
# Wall-clock reference; the elapsed times printed by boss() and worker()
# are measured from this moment.
start_time = time.time()
# Bounded queue of serialized <offer> elements: filled by boss(), drained
# by worker().
q = Queue(maxsize=12800)
def _pick_city(cities):
    """Collapse the list of publish-city texts into a single value.

    More than one city yields the nationwide marker, exactly one city is
    returned unchanged, and an empty list yields None instead of raising
    IndexError (an offer without any city must not abort the caller).
    """
    if len(cities) > 1:
        return '全国'
    return cities[0] if cities else None


def _join_texts(texts):
    """Comma-join element texts.

    Elements without text arrive as None; they are mapped to '' so that the
    join cannot raise TypeError and the positions of the parallel shop
    lists (name, address, phone, longitude, latitude) stay aligned.
    """
    return ','.join(text or '' for text in texts)


# Extraction table consumed by parse(). Each entry is
#   (selector, alias, item_type, dealer)
# selector  - descendant selector evaluated against one <offer> document
# alias     - key under which the value is stored in the result dict
# item_type - None: take the text of the selection as is;
#             "List": collect the text of every match and pass the list
#             to ``dealer``
# dealer    - callable reducing the list of texts to one value (None when
#             item_type is None)
PATH = (
    ('offer title', 'title', None, None),
    ('offer image_url', 'img_url', None, None),
    ('offer url', 'url', None, None),
    ('offer publish_cities city', 'city', "List", _pick_city),
    ('offer start_timestamp', 'begintime', None, None),
    ('offer end_timestamp', 'endtime', None, None),
    ('offer original_price', 'value', None, None),
    ('offer current_price', 'price', None, None),
    ('offer sales_number', 'curnumber', None, None),
    ('offer shops shop_info shop_name', "shop_name", "List", _join_texts),
    ('offer shops shop_info shop_address', "shop_address", "List",
     _join_texts),
    ('offer shops shop_info shop_phone', "shop_phone", "List", _join_texts),
    ('offer shops shop_info longitude', "long", "List", _join_texts),
    ('offer shops shop_info latitude', "lat", "List", _join_texts),
)
def boss():
    """Stream every ``<offer>`` element of the XML dump into the queue ``q``.

    The input file is parsed incrementally. Each completed ``<offer>``
    element is serialized back to an XML string and put on the module-level
    queue for the workers; ``put`` waits while the bounded queue is full.
    When the whole file has been read, the time elapsed since
    ``start_time`` is printed.
    """
    infile = 'test/lashou_global.xml'
    # Both event kinds are requested on purpose: with only 'end' events the
    # first event reported is the first element that *closes* (a leaf), not
    # the document root, so clearing it would never release any memory.
    context = iter(cElementTree.iterparse(infile, events=('start', 'end')))
    event, root = next(context)  # first 'start' event is the real root
    for event, elem in context:
        if event == 'end' and elem.tag == "offer":
            q.put(cElementTree.tostring(elem))
            # The offer has been serialized; drop its subtree and detach
            # everything collected under the root so memory stays bounded.
            elem.clear()
            root.clear()
    print(time.time() - start_time)
def parse(xml):
    """Extract the fields described by ``PATH`` from one serialized offer.

    ``xml`` is parsed with PyQuery in XML mode. For every PATH entry the
    selector is evaluated and the result is stored under the entry's alias:
    the text of the selection when the entry type is None, or the dealer's
    reduction of all matched node texts when the type is 'List'. Any other
    type stores None and prints a notice. Returns the resulting dict.
    """
    doc = PyQuery(xml, parser='xml')
    fields = {}
    for selector, alias, item_type, dealer in PATH:
        if item_type is None:
            # Plain field: the text of the selection is the value.
            fields[alias] = doc(selector).text()
            continue
        if item_type == 'List':
            # Multi-valued field: let the dealer reduce the node texts.
            texts = [node.text for node in doc(selector)]
            fields[alias] = dealer(texts)
            continue
        # Unknown entry type: keep the key present but empty.
        fields[alias] = None
        print('No type settings...')
    return fields
def worker(name):
    """Consume serialized offers from ``q`` and parse them until it drains.

    ``name`` is only used as a label in the progress messages. A progress
    line is printed after every 100 parsed offers. When no item arrives
    within 3 seconds and the queue is empty, the input is considered
    exhausted: the time elapsed since ``start_time`` is printed and the
    worker returns, so that joining on the worker greenlets can complete.
    """
    print('%s is working...' % name)
    deal = 0
    while True:
        # Only the queue read can time out; keep the try body that small.
        try:
            task = q.get(timeout=3)
        except Empty:
            if not q.empty():
                # The timeout fired although items are pending; keep going.
                continue
            print(time.time() - start_time)
            # Returning (instead of sleeping and looping forever) lets the
            # program finish, and avoids a blocking time.sleep() that would
            # stall every other greenlet.
            return
        parse(task)
        deal += 1
        if deal % 100 == 0:
            print('%s has dealed %d deals...' % (name, deal))
        # Cooperative yield so the producer and the other workers can run.
        gevent.sleep(0)
if __name__ == '__main__':
    # One producer greenlet first, then WORKERS consumer greenlets, each
    # labelled "lashou(<index>)"; the index is echoed as it is spawned.
    greenlets = [gevent.spawn(boss)]
    for index in xrange(WORKERS):
        print(index)
        greenlets.append(gevent.spawn(worker, "lashou(%d)" % index))
    # Block until the producer and every worker have finished.
    gevent.joinall(greenlets)
    print('Done')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment