Last active
May 22, 2019 06:50
-
-
Save 510908220/fa5ce80f164515cf7e2e094799d45bda to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import os | |
import shutil | |
import json | |
# Directory containing this script; downloads are stored relative to it.
BASE = os.path.split(__file__)[0]
# Folder that receives every downloaded file.
DATA_PATH = os.path.join(BASE, 'data')
def _fetch(url):
    """Download ``url`` and store it as a file inside ``DATA_PATH``.

    The file name is the last ``/``-separated segment of ``url``. A response
    whose status code is not 200 is skipped silently and nothing is written.

    Args:
        url: Address of the resource to download.

    Raises:
        requests.RequestException: On connection failures or timeouts.
        OSError: If the destination directory or file cannot be written.
    """
    name = url.split('/')[-1]
    path = os.path.join(DATA_PATH, name)
    # Create the target folder on demand so a missing directory does not
    # make every single download fail.
    os.makedirs(DATA_PATH, exist_ok=True)
    # The timeout keeps one stalled server from blocking the whole crawl;
    # the ``with`` block releases the streamed connection on every path,
    # including the early return below.
    with requests.get(url, stream=True, timeout=30) as r:
        if r.status_code != 200:
            return
        # Have urllib3 undo gzip/deflate content encoding while the raw
        # stream is copied, so the saved bytes are the actual payload.
        r.raw.decode_content = True
        with open(path, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
def fetch(url):
    """Best-effort download of ``url``.

    Delegates to ``_fetch`` and never raises: any exception is reported on
    standard output together with the offending URL, so that one bad link
    does not abort the surrounding crawl.
    """
    try:
        _fetch(url)
    except Exception as exc:
        # Deliberately broad: this is the per-URL error boundary.
        print('error:', url, exc)
def get_urls(page = 1):
    """Return the ``thumb_src`` URLs listed on one page of the posts API.

    Args:
        page: Page number substituted into the API query (100 posts are
            requested per page).

    Returns:
        A list with the ``thumb_src`` value of every item in the JSON
        response, or an empty list if the request or the parsing fails
        (the error is printed to standard output).
    """
    url = 'http://adr.meizitu.net/wp-json/wp/v2/posts?page={}&per_page=100'.format(page)
    try:
        # The timeout keeps an unresponsive server from hanging the crawl.
        r = requests.get(url, timeout=30)
        # Surface HTTP errors directly instead of trying to parse an error
        # payload as if it were a list of posts.
        r.raise_for_status()
        return [item['thumb_src'] for item in r.json()]
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        # RequestException: network/HTTP failure; ValueError: body is not
        # valid JSON; KeyError/TypeError: JSON does not have the expected
        # list-of-objects-with-'thumb_src' structure.
        print('error:', e)
        return []
def main(start_page=1, end_page=500):
    """Download every image listed on the pages ``start_page`` to ``end_page``.

    Args:
        start_page: First page number to process.
        end_page: Page number at which to stop; this page itself is NOT
            processed (exclusive bound).
    """
    for page in range(start_page, end_page):
        print('page is:', page)
        for url in get_urls(page):
            fetch(url)
# Start the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
1. 将上面的代码保存到文件中，例如 spider.py。
2. 安装 requests 库：`pip install requests`。
3. 在当前目录下创建一个 data 文件夹。
4. 运行 `python spider.py`。
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment