Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
def crawl():
    """Fetch every published Hahow course via the paginated public API.

    Pages through the courses endpoint 30 records at a time, using the
    last record's ``_id`` and ``incubateTime`` as the cursor for the next
    request, then dumps the accumulated list to ``hahow_courses.json``.

    Returns:
        list[dict]: all course records returned by the API.

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status.
    """
    # First page:  https://api.hahow.in/api/courses?limit=12&status=PUBLISHED
    # Next pages:  https://api.hahow.in/api/courses?latestId=...&latestValue=...&limit=30&status=PUBLISHED
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/59.0.3071.115 Safari/537.36'}
    url = 'https://api.hahow.in/api/courses'
    courses = []
    # timeout= keeps a stalled connection from hanging the crawler forever;
    # raise_for_status() stops us from feeding an HTML error page to .json()
    resp = requests.get(url + '?limit=30&status=PUBLISHED',
                        headers=headers, timeout=30)
    resp.raise_for_status()
    resp_courses = resp.json()
    while resp_courses:  # an empty page means we have reached the end
        time.sleep(3)  # throttle requests to be polite to the server
        courses += resp_courses
        # Cursor-based pagination: the last record seen so far supplies
        # the latestId/latestValue pair for the next page.
        param = '?latestId={0}&latestValue={1}&limit=30&status=PUBLISHED'.format(
            courses[-1]['_id'], courses[-1]['incubateTime'])
        resp = requests.get(url + param, headers=headers, timeout=30)
        resp.raise_for_status()
        resp_courses = resp.json()
    # Persist the raw course data for later analysis.
    with open('hahow_courses.json', 'w', encoding='utf-8') as f:
        json.dump(courses, f, indent=2, sort_keys=True, ensure_ascii=False)
    return courses
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.