Skip to content

Instantly share code, notes, and snippets.

@AlJohri
Created February 5, 2016 21:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save AlJohri/5fe6f8d7133a9c588b71 to your computer and use it in GitHub Desktop.
Save AlJohri/5fe6f8d7133a9c588b71 to your computer and use it in GitHub Desktop.
# News-article URLs that make up the workload; the script iterates over this
# list when downloading and parsing articles.
urls = [
    "http://www.baltimorenews.net/index.php/sid/234363921",
    "http://www.baltimorenews.net/index.php/sid/234323971",
    "http://www.atlantanews.net/index.php/sid/234323891",
    "http://www.wpbf.com/news/funeral-held-for-gabby-desouza/33874572",
    "http://www.tennessean.com/story/news/politics/2015/06/30/obama-focus-future-health-care-burwell-says/29540753/",
    "http://www.atlantanews.net/index.php/sid/234323901",
    "http://www.baltimorenews.net/index.php/sid/234323975",
    "http://www.utsandiego.com/news/2015/jun/30/backcountry-lilac-development-opposition-general/",
    "http://www.newsnet5.com/newsy/apples-ebook-pricing-scandal-a-long-road-to-a-small-fine",
    "http://www.baltimorenews.net/index.php/sid/234323977",
    "http://www.wsmv.com/story/29447077/trying-to-make-hitting-skid-disappear-maddon-hires-magician",
    "http://www.atlantanews.net/index.php/sid/234323913",
    "http://www.baltimorenews.net/index.php/sid/234323979",
    "http://www.newsleader.com/story/sports/2015/06/30/virginia-baseball-fan-happy-proven-wrong/29540965/",
    "http://www.baltimorenews.net/index.php/sid/234323981",
    "http://www.baltimorenews.net/index.php/sid/234323987",
    "http://www.mcall.com/entertainment/dining/mc-fratzolas-pizzeria-bethlehem-review-20150630-story.html",
    "http://www.atlantanews.net/index.php/sid/234323911",
    "http://www.baltimorenews.net/index.php/sid/234323985",
    "http://www.atlantanews.net/index.php/sid/234323887",
    "http://wtvr.com/2015/06/30/man-who-vandalized-confederate-statue-deeply-regrets-actions/",
    "http://www.baltimorenews.net/index.php/sid/234323923",
    "http://www.witn.com/home/headlines/Goldsboro-teens-charged-with-shooting-into-home-311067541.html",
    "http://www.atlantanews.net/index.php/sid/234323995",
]
# ---------------------------------------------------------------- #
# Baseline run: download and parse every URL one after another, save each
# article's text to disk, and print the elapsed wall time.
print("Synchronous")
import time
import hashlib

import newspaper

start_time = time.time()
for url in urls:
    print(url)
    article = newspaper.Article(url)
    try:
        article.download()
        article.parse()
    except newspaper.article.ArticleException as exc:
        # Best effort: a URL that cannot be fetched or parsed is reported and
        # skipped so one bad page does not abort the whole run.
        print("skipped", url, exc)
        continue
    # The output name is the MD5 hex digest of the URL: stable per URL and
    # free of characters that are awkward in file names.
    out_name = hashlib.md5(url.encode('utf-8')).hexdigest() + ".txt"
    # Explicit UTF-8: article text is arbitrary web content, and the
    # platform's default encoding may be unable to represent it.
    with open(out_name, "w", encoding="utf-8") as f:
        f.write(article.text)
print("sync", time.time() - start_time, "\n")
# ---------------------------------------------------------------- #
# Asynchronous run: banner, the modules it needs, and the start timestamp.
print("Asynchronous")
import asyncio
import hashlib
import time

import aiohttp
import newspaper

# Taken before the coroutines are defined; the elapsed time is computed from
# it once the event loop has finished.
start_time = time.time()
async def get_article(url):
    """Fetch *url* over HTTP, extract the article text and save it to disk.

    The text is written to ``<md5 hex digest of url>.txt`` in the current
    working directory. A URL that cannot be fetched or parsed is reported on
    stdout and skipped, in line with the synchronous loop, which also skips
    URLs that fail.

    Args:
        url: Address of the article page to download.

    Returns:
        None.
    """
    print(url)
    try:
        # The module-level ``aiohttp.get()`` helper is not available in
        # current aiohttp releases; requests go through a ClientSession.
        # A short-lived session per URL keeps this coroutine self-contained.
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                # Treat HTTP error statuses as failed downloads instead of
                # saving an error page as if it were the article.
                response.raise_for_status()
                content = await response.read()
    except (aiohttp.ClientError, asyncio.TimeoutError) as exc:
        print("skipped", url, exc)
        return

    article = newspaper.Article(url)
    # The markup was already downloaded above, so it is handed to newspaper
    # directly and only parse() is called on the article.
    article.set_html(content)
    try:
        article.parse()
    except newspaper.article.ArticleException as exc:
        print("skipped", url, exc)
        return

    out_name = hashlib.md5(url.encode('utf-8')).hexdigest() + ".txt"
    # Explicit UTF-8: article text is arbitrary web content, and the
    # platform's default encoding may be unable to represent it.
    with open(out_name, "w", encoding="utf-8") as f:
        f.write(article.text)
async def main(urls):
    """Run ``get_article`` for every URL concurrently and wait for all of them.

    Args:
        urls: Iterable of article URLs. May be empty, in which case the
            coroutine returns immediately.

    Returns:
        None. A task that ends with an exception is reported on stdout
        rather than raised, so one failing URL does not hide the others.
    """
    urls = list(urls)  # materialise once: iterated here and again below
    tasks = [asyncio.ensure_future(get_article(url)) for url in urls]
    if not tasks:
        # Nothing to wait for; asyncio.wait() would raise ValueError on an
        # empty collection.
        return
    # return_exceptions=True waits for every task and hands back each
    # exception as a result, so failures can be reported instead of ending
    # up as unretrieved task exceptions.
    results = await asyncio.gather(*tasks, return_exceptions=True)
    for url, result in zip(urls, results):
        if isinstance(result, BaseException):
            print("failed", url, repr(result))
# Drive main() to completion on a dedicated event loop and print the elapsed
# wall time. new_event_loop() is used instead of asyncio.get_event_loop(),
# whose implicit loop creation outside a running loop is deprecated in recent
# Python versions.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
    loop.run_until_complete(main(urls))
finally:
    # Release the loop even if the run fails or is interrupted.
    asyncio.set_event_loop(None)
    loop.close()
print("async", time.time() - start_time, "\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment