Skip to content

Instantly share code, notes, and snippets.

@coline-carle
Last active September 25, 2017 16:23
Show Gist options
  • Save coline-carle/e725c90b213e8ffbe8a133f66fb2ec93 to your computer and use it in GitHub Desktop.
certifi==2017.7.27.1
chardet==3.0.4
idna==2.6
lxml==3.8.0
python-dateutil==2.6.1
requests==2.18.4
six==1.10.0
SQLAlchemy==1.1.14
urllib3==1.22
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import sys
import requests
from lxml import etree
import dateutil.parser
from db import Page, Session
class SitemapSpider(object):
    """Fetch the wowhead sitemap index and crawl every sub-sitemap whose
    URL contains the requested object type (e.g. 'npc')."""

    sitemap = 'http://www.wowhead.com/sitemap'
    npc_regexp = r'npc'

    def __init__(self, object, dbsession):
        # NOTE(review): parameter name `object` shadows the builtin; kept
        # unchanged for backward compatibility with existing callers.
        self.__dbsession = dbsession
        self.__object = object

    def crawl(self):
        """Download and parse the sitemap index, then walk its entries."""
        print("crawling : {}".format(self.sitemap))
        response = requests.get(self.sitemap)
        # recover=True tolerates malformed XML; external entities are
        # disabled since the content is untrusted.
        xmlp = etree.XMLParser(recover=True,
                               remove_comments=True,
                               resolve_entities=False)
        self._root = etree.fromstring(response.content, parser=xmlp)
        self.__iter__()

    def __iter__(self):
        """Collect all <loc> URLs from the index and crawl the matching ones."""
        sub_sitemaps = []
        # Iterate elements directly: getchildren() is deprecated and was
        # removed from recent lxml/ElementTree releases.
        for elem in self._root:
            for el in elem:
                tag = el.tag
                # Strip the XML namespace prefix, e.g. '{...}loc' -> 'loc'.
                name = tag.split('}', 1)[1] if '}' in tag else tag
                if name == 'loc' and el.text:
                    sub_sitemaps.append(el.text.strip())
        # Keep only sub-sitemaps for the requested object type.
        object_sitemaps = filter(lambda o: self.__object in o, sub_sitemaps)
        object_spider = SubSitemapSpider(self.__object, self.__dbsession)
        for object_sitemap in object_sitemaps:
            object_spider.crawl(object_sitemap)
class SubSitemapSpider(object):
    """Crawl a single sub-sitemap and upsert each entry as a Page row."""

    def __init__(self, type, dbsession):
        # NOTE(review): parameter name `type` shadows the builtin; kept
        # unchanged for backward compatibility with existing callers.
        # Extracts the numeric game id from URLs like '...?npc=12345'.
        regexp = r'\=(?P<id>\d{1,7})'
        self.__regexp = re.compile(regexp)
        self.__type = type
        self.__dbsession = dbsession

    def crawl(self, url):
        """Download and parse one sub-sitemap, then persist its entries."""
        print("crawling : {}".format(url))
        response = requests.get(url)
        xmlp = etree.XMLParser(recover=True,
                               remove_comments=True,
                               resolve_entities=False)
        self._root = etree.fromstring(response.content, parser=xmlp)
        self.__iter__()

    def __iter__(self):
        """Upsert one Page per <url> element, keyed on (gameID, type)."""
        for elem in self._root:
            page = Page()
            page.type = self.__type
            for el in elem:
                tag = el.tag
                # Strip the XML namespace prefix, e.g. '{...}loc' -> 'loc'.
                name = tag.split('}', 1)[1] if '}' in tag else tag
                if name == 'loc':
                    if el.text:
                        page.loc = el.text.strip()
                        match_id = self.__regexp.search(page.loc)
                        # Guard: search() returns None when the URL carries
                        # no '=<digits>' id; the original crashed here.
                        if match_id:
                            page.gameID = match_id.group('id')
                if name == 'lastmod':
                    if el.text:
                        page.lastMod = dateutil.parser.parse(el.text.strip())
            # Update the existing row if this (gameID, type) was seen before,
            # otherwise insert a new one.
            existing_page = self.__dbsession.query(Page).\
                filter(Page.gameID == page.gameID, Page.type == self.__type).\
                one_or_none()
            if existing_page:
                existing_page.loc = page.loc
                existing_page.lastMod = page.lastMod
                self.__dbsession.add(existing_page)
            else:
                self.__dbsession.add(page)
        # One commit per sub-sitemap keeps the transaction bounded.
        self.__dbsession.commit()
def main(argv):
    """Entry point.

    argv: command-line arguments (without the program name). Accepts at
    most one argument, the object type to crawl; defaults to 'npc'.
    Exits with status 1 on bad usage.
    """
    # Renamed from `object` (local only, so no caller impact): avoid
    # shadowing the builtin.
    object_type = "npc"
    if len(argv) == 1:
        object_type = argv[0]
    elif len(argv) > 1:
        # Fixed typo: "argments" -> "arguments".
        print("invalid number of arguments")
        sys.exit(1)
    dbsession = Session()
    try:
        spider = SitemapSpider(object_type, dbsession)
        spider.crawl()
    finally:
        # Always release the DB session, even if crawling raises.
        dbsession.close()


if __name__ == "__main__":
    main(sys.argv[1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment