Last active
September 25, 2017 16:23
-
-
Save coline-carle/e725c90b213e8ffbe8a133f66fb2ec93 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
certifi==2017.7.27.1
chardet==3.0.4
idna==2.6
lxml==3.8.0
python-dateutil==2.6.1
requests==2.18.4
six==1.10.0
SQLAlchemy==1.1.14
urllib3==1.22
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import sys
import requests
from lxml import etree
import dateutil.parser
from db import Page, Session
class SitemapSpider(object):
    """Fetch the wowhead sitemap index and crawl the sub-sitemaps
    whose URL contains the requested object type (e.g. 'npc')."""

    # URL of the master sitemap index.
    sitemap = 'http://www.wowhead.com/sitemap'
    npc_regexp = r'npc'

    def __init__(self, object, dbsession):
        # NOTE(review): parameter name shadows the builtin `object`;
        # kept unchanged for backward compatibility with callers.
        self.__dbsession = dbsession
        self.__object = object

    def crawl(self):
        """Download the sitemap index and crawl every matching sub-sitemap."""
        print("crawling : {}".format(self.sitemap))
        response = requests.get(self.sitemap)
        # recover=True tolerates malformed XML; entities are not resolved
        # to avoid XXE-style expansion on remote content.
        xmlp = etree.XMLParser(recover=True,
                               remove_comments=True,
                               resolve_entities=False)
        self._root = etree.fromstring(response.content, parser=xmlp)
        self.__iter__()

    def __iter__(self):
        # Collect the <loc> URL of every sub-sitemap listed in the index.
        # (Direct element iteration replaces the deprecated getchildren().)
        sub_sitemaps = []
        for elem in self._root:
            for el in elem:
                tag = el.tag
                # Strip the XML namespace: '{ns}loc' -> 'loc'.
                name = tag.split('}', 1)[1] if '}' in tag else tag
                if name == 'loc' and el.text:
                    sub_sitemaps.append(el.text.strip())
        # Keep only the sitemaps for the requested object type.
        object_sitemaps = filter(lambda o: self.__object in o, sub_sitemaps)
        object_spider = SubSitemapSpider(self.__object, self.__dbsession)
        for object_sitemap in object_sitemaps:
            object_spider.crawl(object_sitemap)
class SubSitemapSpider(object):
    """Crawl a single sub-sitemap and upsert each entry as a Page row."""

    def __init__(self, type, dbsession):
        # NOTE(review): parameter name shadows the builtin `type`;
        # kept unchanged for backward compatibility with callers.
        # Extracts the numeric game id from URLs like '...?npc=12345'.
        regexp = r'\=(?P<id>\d{1,7})'
        self.__regexp = re.compile(regexp)
        self.__type = type
        self.__dbsession = dbsession

    def crawl(self, url):
        """Download one sub-sitemap and store/update its pages."""
        print("crawling : {}".format(url))
        response = requests.get(url)
        xmlp = etree.XMLParser(recover=True,
                               remove_comments=True,
                               resolve_entities=False)
        self._root = etree.fromstring(response.content, parser=xmlp)
        self.__iter__()

    def __iter__(self):
        # (Direct element iteration replaces the deprecated getchildren().)
        for elem in self._root:
            page = Page()
            page.type = self.__type
            for el in elem:
                tag = el.tag
                # Strip the XML namespace: '{ns}loc' -> 'loc'.
                name = tag.split('}', 1)[1] if '}' in tag else tag
                if name == 'loc':
                    if el.text:
                        page.loc = el.text.strip()
                        matchID = self.__regexp.search(page.loc)
                        # Guard: a URL without '=<digits>' yields no match;
                        # the original crashed on matchID.group() here.
                        if matchID:
                            page.gameID = matchID.group('id')
                elif name == 'lastmod':
                    if el.text:
                        page.lastMod = dateutil.parser.parse(el.text.strip())
            # Upsert keyed on (gameID, type): update the existing row
            # if present, otherwise insert the new page.
            existing_page = self.__dbsession.query(Page).\
                filter(Page.gameID == page.gameID, Page.type == self.__type).\
                one_or_none()
            if existing_page:
                existing_page.loc = page.loc
                existing_page.lastMod = page.lastMod
                self.__dbsession.add(existing_page)
            else:
                self.__dbsession.add(page)
        # Commit once per sitemap rather than per page.
        # NOTE(review): original indentation was lost in the paste;
        # confirm the intended commit granularity.
        self.__dbsession.commit()
def main(argv):
    """Entry point: crawl wowhead sitemaps for one object type.

    argv -- command-line arguments excluding the program name;
            at most one item, the object type (defaults to 'npc').
            Exits with status 1 on more than one argument.
    """
    # Renamed from `object` (shadowed the builtin) — local only,
    # so callers are unaffected.
    object_type = "npc"
    if len(argv) > 1:
        # Fixed typo in the original message ('argments').
        print("invalid number of arguments")
        sys.exit(1)
    if argv:
        object_type = argv[0]
    dbsession = Session()
    try:
        spider = SitemapSpider(object_type, dbsession)
        spider.crawl()
    finally:
        # Always release the DB session, even if crawling raises.
        dbsession.close()


if __name__ == "__main__":
    main(sys.argv[1:])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment