# Karl-Han/scrap.py

## scrap.py
import newspaper
import json

# URLs of the news sites to crawl. The script entry point appends to this
# list and run() iterates over it.
source_list = []

def run():
    """Scrape every site listed in the module-level ``source_list``, in order."""
    for site_url in source_list:
        scrap(site_url)

def scrap(url):
    """Crawl one news site and save each of its articles as a JSON file.

    Builds a newspaper source for ``url`` with article memoization turned
    off, prints how many articles were found, then builds every article and
    writes its title, authors, URL and text to ``data/<name>.json``.
    ``<name>`` is the last non-empty "/"-separated segment of the article
    URL, or ``index`` if the URL has none.

    Args:
        url: Home page URL of the news site to crawl.
    """
    import os  # local import: only this function needs it

    config = newspaper.Config()
    config.memoize_articles = False
    # Pass the config by keyword: the second positional parameter of
    # newspaper.build() is ``dry``, so a positional config is never applied.
    # newspaper.build() already builds the source, so no separate
    # source.build() call is needed afterwards.
    source = newspaper.build(url, config=config)
    print("Totally {} articles".format(source.size()))

    # open() does not create missing directories.
    os.makedirs("data", exist_ok=True)

    for article in source.articles:
        article_url = article.url
        article.build()

        # A URL ending in "/" has an empty last segment; without stripping
        # it every such article would be written to the same "data/.json".
        name = article_url.rstrip("/").split("/")[-1] or "index"
        filename = name + ".json"

        result = {
            'title': article.title,
            'author': article.authors,
            'url': article.url,
            'text': article.text,
        }

        with open(os.path.join("data", filename), "w") as f:
            json.dump(result, f)

if __name__ == "__main__":
    # Register the seed site(s) to crawl, then scrape everything registered.
    source_list.extend(["http://www.chinadaily.com.cn/"])
    run()
	import newspaper
	import json

	# URLs of the news sites to crawl. The script entry point appends to this
	# list and run() iterates over it.
	source_list = []

	def run():
		"""Scrape every site listed in ``source_list``, in order."""
		for url in source_list:
			scrap(url)

	def scrap(url):
		"""Crawl one news site and save each of its articles as a JSON file.

		Builds a newspaper source for ``url`` with article memoization turned
		off, prints how many articles were found, then builds every article
		and writes its title, authors, URL and text to ``data/<name>.json``.
		``<name>`` is the last non-empty "/"-separated segment of the article
		URL, or ``index`` if the URL has none.

		Args:
			url: Home page URL of the news site to crawl.
		"""
		import os  # local import: only this function needs it

		config = newspaper.Config()
		config.memoize_articles = False
		# Pass the config by keyword: the second positional parameter of
		# newspaper.build() is ``dry``, so a positional config is never
		# applied. newspaper.build() already builds the source, so no
		# separate source.build() call is needed afterwards.
		source = newspaper.build(url, config=config)
		print("Totally {} articles".format(source.size()))

		# open() does not create missing directories.
		os.makedirs("data", exist_ok=True)

		for article in source.articles:
			article_url = article.url
			article.build()

			# A URL ending in "/" has an empty last segment; without
			# stripping it every such article would be written to the
			# same "data/.json".
			name = article_url.rstrip("/").split("/")[-1] or "index"
			filename = name + ".json"

			result = {
				'title': article.title,
				'author': article.authors,
				'url': article.url,
				'text': article.text,
			}

			with open(os.path.join("data", filename), "w") as f:
				json.dump(result, f)

	if __name__ == "__main__":
		# Register the seed site to crawl, then scrape everything registered.
		source_list.append("http://www.chinadaily.com.cn/")
		run()