Skip to content

Instantly share code, notes, and snippets.

@Karl-Han
Created December 29, 2019 03:35
Show Gist options
  • Save Karl-Han/e792edda9e6d17b5968d9e59a6441175 to your computer and use it in GitHub Desktop.
Save Karl-Han/e792edda9e6d17b5968d9e59a6441175 to your computer and use it in GitHub Desktop.
import json
import os

import newspaper
# URLs of the news sites to crawl. Starts empty; the script entry point appends
# to it before run() iterates over it.
source_list = []
def run():
    """Scrape each site URL currently held in the module-level ``source_list``."""
    for site_url in source_list:
        scrap(site_url)
def _article_filename(article_url):
    """Return the JSON file name for *article_url*: its last non-empty "/" segment plus ".json"."""
    segments = [part for part in article_url.split("/") if part]
    # A URL ending in "/" has an empty final segment; using it directly would
    # name every such article ".json" and make them overwrite each other.
    stem = segments[-1] if segments else "index"
    return stem + ".json"


def scrap(url):
    """Download every article found on the news site at *url* and save each as JSON.

    One file per article is written to the ``data/`` directory (created if it
    does not exist). Each file holds a JSON object with the keys ``title``,
    ``author``, ``url`` and ``text``. Articles that fail to build are reported
    on stdout and skipped so that one bad page does not abort the whole crawl.

    Args:
        url: Address of the news site to crawl.
    """
    config = newspaper.Config()
    config.memoize_articles = False
    # The config must be passed by keyword: the second positional parameter of
    # newspaper.build() is ``dry``, so a positional config was silently ignored.
    # build() already builds the source, so no separate source.build() is needed.
    source = newspaper.build(url, config=config)
    print("Totally {} articles".format(source.size()))

    os.makedirs("data", exist_ok=True)
    for article in source.articles:
        # Capture the URL before building, as the file name is derived from it.
        article_url = article.url
        try:
            article.build()
        except newspaper.ArticleException as exc:
            print("Skipping {}: {}".format(article_url, exc))
            continue

        result = {
            'title': article.title,
            'author': article.authors,
            'url': article.url,
            'text': article.text,
        }
        target = os.path.join("data", _article_filename(article_url))
        with open(target, "w", encoding="utf-8") as f:
            json.dump(result, f)
if __name__ == "__main__":
    # Script entry point: register the China Daily front page as the only
    # source, then crawl everything in the list.
    source_list.extend(["http://www.chinadaily.com.cn/"])
    run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment