Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
def crawl():
    """Fetch every published Hahow course via the paginated public API.

    Pages through the courses endpoint 30 records at a time, using the
    last record's ``_id`` and ``incubateTime`` as the cursor for the next
    request, then dumps the accumulated list to ``hahow_courses.json``.

    Returns:
        list[dict]: all course records returned by the API.

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status.
    """
    # First page:  https://api.hahow.in/api/courses?limit=12&status=PUBLISHED
    # Next pages:  https://api.hahow.in/api/courses?latestId=...&latestValue=...&limit=30&status=PUBLISHED
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/59.0.3071.115 Safari/537.36'}
    url = 'https://api.hahow.in/api/courses'
    courses = []
    # timeout= keeps a stalled connection from hanging the crawler forever;
    # raise_for_status() stops us from feeding an HTML error page to .json()
    resp = requests.get(url + '?limit=30&status=PUBLISHED',
                        headers=headers, timeout=30)
    resp.raise_for_status()
    resp_courses = resp.json()
    while resp_courses:  # an empty page means we have reached the end
        time.sleep(3)  # throttle requests to be polite to the server
        courses += resp_courses
        # Cursor-based pagination: the last record seen so far supplies
        # the latestId/latestValue pair for the next page.
        param = '?latestId={0}&latestValue={1}&limit=30&status=PUBLISHED'.format(
            courses[-1]['_id'], courses[-1]['incubateTime'])
        resp = requests.get(url + param, headers=headers, timeout=30)
        resp.raise_for_status()
        resp_courses = resp.json()
    # Persist the raw course data for later analysis.
    with open('hahow_courses.json', 'w', encoding='utf-8') as f:
        json.dump(courses, f, indent=2, sort_keys=True, ensure_ascii=False)
    return courses
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.