Created
September 19, 2017 11:57
-
-
Save clarksun/802581aa733d0538dc7e311fc2a7cc5f to your computer and use it in GitHub Desktop.
Using #sqlalchemy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# frontera/utils/graphs/manager.py
DEFAULT_ENGINE = 'sqlite:///:memory:'  # in-memory SQLite: the graph vanishes when the process exits


class CrawlGraphManager(object):
    """Manage a crawl graph (pages and the links between them) stored via SQLAlchemy.

    Relies on ``Base``, ``CrawlPage``, ``CrawlSiteData`` and ``CrawlSiteListData``
    being imported from the sibling model module (not visible in this chunk).
    """

    def __init__(self, engine=DEFAULT_ENGINE, autocommit=False, autoflush=False,
                 echo=False, drop_all_tables=False, clear_content=False):
        """Create the engine and session; optionally reset the schema or its rows.

        FIX: the parameter was spelled ``autoFlush`` while the body referenced
        ``autoflush``, raising NameError on every instantiation. Renamed to the
        snake_case form actually used (and matching SQLAlchemy's keyword).
        """
        self.engine = create_engine(engine, echo=echo)
        if drop_all_tables:
            Base.metadata.drop_all(self.engine)
        Base.metadata.create_all(self.engine)
        self.Session = sessionmaker()
        self.Session.configure(bind=self.engine, autocommit=autocommit, autoflush=autoflush)
        self.session = self.Session()
        if clear_content:
            # Keep the schema but delete every row of every mapped table.
            for name, table in Base.metadata.tables.items():
                self.session.execute(table.delete())

    @property
    def pages(self):
        """All pages currently in the graph."""
        # Query through the session, consistent with `seeds` below; the original
        # wrapped an identical query result in a redundant list comprehension.
        return self.session.query(CrawlPage).all()

    @property
    def seeds(self):
        """Pages flagged as crawl seeds."""
        return self.session.query(CrawlPage).filter_by(is_seed=True).all()

    def add_page(self, url, status=200, n_redirects=0, is_seed=False, commit=True):
        """Get-or-create the page for ``url``; set its fields only on creation.

        Returns the (new or existing) CrawlPage.
        """
        page, created = CrawlPage.get_or_create(self.session, url=url)
        if created:
            page.is_seed = is_seed
            page.status = status
            page.n_redirects = n_redirects
            # NOTE(review): commit nested under `created` — indentation was lost in
            # the source; reconstructed to commit only when something changed.
            if commit:
                self.session.commit()
        return page

    def add_link(self, page, url, commit=True, status=200):
        """Ensure a page for ``url`` exists and link it from ``page`` (idempotent)."""
        link_page, created = CrawlPage.get_or_create(self.session, url=url)
        if created:
            link_page.status = status
        if link_page not in page.links:
            page.links.append(link_page)
        if commit:
            self.session.commit()
        return link_page

    def get_page(self, url):
        """Return the page for ``url``, or None if it is not in the graph."""
        return self.session.query(CrawlPage).filter_by(url=url).first()

    def add_site(self, site, default_status=200, default_n_redirects=0):
        """Add a whole site: an iterable of ``(info, links)`` pairs.

        ``info`` may be a bare URL, a ``(status, url)`` pair, or a
        ``(status, url, n_redirects)`` triple. The first page is marked as seed.
        """
        pages = site.pages if isinstance(site, CrawlSiteData) else site
        for i, (info, links) in enumerate(pages):
            if isinstance(info, tuple):
                if len(info) == 2:
                    status, page_url, n_redirects = (info[0], info[1], default_n_redirects)
                else:
                    status, page_url, n_redirects = info
            else:
                status, page_url, n_redirects = (default_status, info, default_n_redirects)
            page = self.add_page(url=page_url, status=status, n_redirects=n_redirects, is_seed=(i == 0))
            for link_url in links:
                self.add_link(page=page, url=link_url, status=default_status)

    def add_site_list(self, graph, default_status=200, default_n_redirects=0):
        """Add every site in ``graph`` (a CrawlSiteListData or plain iterable)."""
        sites = graph.sites if isinstance(graph, CrawlSiteListData) else graph
        for site in sites:
            self.add_site(site=site, default_status=default_status, default_n_redirects=default_n_redirects)

    def save(self):
        """Commit any pending changes to the database."""
        self.session.commit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.