Skip to content

Instantly share code, notes, and snippets.

@cryzed
Created August 27, 2017 19:15
Show Gist options
  • Save cryzed/6ee7f4672d7b72d6e6d789a00903d891 to your computer and use it in GitHub Desktop.
Save cryzed/6ee7f4672d7b72d6e6d789a00903d891 to your computer and use it in GitHub Desktop.
import argparse
import functools
import sys
import urllib.parse
import xmlrpc.server
import newspaper
import selenium.webdriver
_requires_webdriver = set()
argument_parser = argparse.ArgumentParser()
argument_parser.add_argument('--port', type=int, default=8100)
@functools.lru_cache(maxsize=1)
def get_webdriver():
    """Return the shared Chrome WebDriver, starting it on first use.

    The function takes no arguments, so the ``lru_cache`` wrapper keeps the
    first driver it creates and hands that same object back on later calls.
    """
    chrome = selenium.webdriver.Chrome()
    return chrome
def get_newspaper_article_html(url, html=None):
    """Return the article HTML that newspaper extracts for ``url``.

    The first attempt passes ``html`` straight to ``Article.download``
    (presumably newspaper fetches the page itself when it is ``None`` --
    confirm against the newspaper docs). If that yields no article HTML, the
    URL's host is recorded in ``_requires_webdriver`` and the page is loaded
    in the shared browser, whose rendered DOM is parsed instead. Hosts that
    are already recorded skip the first attempt, and any caller-supplied
    ``html`` is ignored for them.

    Args:
        url: Address of the article.
        html: Optional markup to extract from on the first attempt.

    Returns:
        The ``article_html`` attribute of the parsed article; this is falsy
        when the browser-rendered attempt also extracts nothing.
    """
    host = urllib.parse.urlsplit(url).netloc
    extractor = newspaper.Article(url, keep_article_html=True)

    if host not in _requires_webdriver:
        extractor.download(html)
        extractor.parse()
        extracted = extractor.article_html
        if extracted:
            return extracted

        # Nothing was extracted: remember the host and retry below with a
        # fresh Article, fed by the browser-rendered page.
        _requires_webdriver.add(host)
        extractor = newspaper.Article(url, keep_article_html=True)

    browser = get_webdriver()
    browser.get(url)
    # Serialise the live DOM so content injected by scripts is included.
    rendered = browser.execute_script('return document.documentElement.outerHTML')
    extractor.download(rendered)
    extractor.parse()
    return extractor.article_html
def main(arguments):
    """Run the XML-RPC server until it is interrupted with Ctrl+C.

    Binds a ``SimpleXMLRPCServer`` to ``localhost`` on ``arguments.port`` and
    registers ``get_newspaper_article_html`` under its own name. On the way
    out the listening socket is closed and, if a browser was started while
    serving requests, that browser is quit.

    Args:
        arguments: Parsed command-line namespace; only ``port`` is read.

    Returns:
        None.
    """
    # The with-block closes the listening socket on every exit path.
    with xmlrpc.server.SimpleXMLRPCServer(('localhost', arguments.port)) as server:
        server.register_function(get_newspaper_article_html)
        try:
            print(f'Newspaper server is running on http://localhost:{arguments.port}/...')
            server.serve_forever()
        except KeyboardInterrupt:
            print('Shutting down...')
        finally:
            # Only touch the browser if one was actually created; calling
            # get_webdriver() unconditionally would launch Chrome just to
            # close it again.
            if get_webdriver.cache_info().currsize:
                get_webdriver().quit()
                get_webdriver.cache_clear()
# Script entry point: parse the command line, run the server, and use
# whatever main() returns as the process exit status.
if __name__ == '__main__':
    arguments = argument_parser.parse_args()
    exit_status = main(arguments)
    sys.exit(exit_status)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment