Skip to content

Instantly share code, notes, and snippets.

@NamPNQ
Created August 31, 2014 14:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Save NamPNQ/4b7866b970676bb4f508 to your computer and use it in GitHub Desktop.
Python Scrape Tool
import requests
import re
import os
from urlparse import urljoin
def save(resource_url, resource_dir):
    """Download `resource_url` and write it to the local path `resource_dir`.

    Creates any missing parent directories first. Only writes the file when
    the server answers 200; other statuses are silently skipped (best-effort).
    """
    directories, _ = os.path.split(resource_dir)
    # Merge the two nested ifs; skip makedirs for bare filenames ('' dirname).
    if directories and not os.path.exists(directories):
        os.makedirs(directories)
    r = requests.get(resource_url, stream=True)
    try:
        if r.status_code == 200:
            with open(resource_dir, 'wb') as f:
                for chunk in r.iter_content(1024):
                    f.write(chunk)
    finally:
        # stream=True keeps the connection open until explicitly closed;
        # the original's `del r` did not guarantee this.
        r.close()
def main():
    """Fetch the page at the module-level `url`, download every locally
    referenced href/src resource next to it, then save the page itself
    as index.html. Does nothing unless the page responds with 200.
    """
    res = requests.get(url)
    if res.status_code != 200:
        return
    resources = []
    # Keep only relative references: drop absolute URLs, fragments, and
    # scheme-style values (mailto:, data:, javascript:) that are not paths.
    resources.extend(
        ref for ref in re.findall(r'href=[\'\"](.*?)[\'\"]', res.text)
        if not ref.startswith(('http', '#')) and ':' not in ref
    )
    # Apply the same ':' scheme filter as href (the original omitted it,
    # so e.g. a data: URI src would have been passed to save() as a path).
    resources.extend(
        ref for ref in re.findall(r'src=[\'\"](.*?)[\'\"]', res.text)
        if not ref.startswith(('http', '#')) and ':' not in ref
    )
    # `ref` instead of `file` — the original shadowed the builtin.
    for ref in resources:
        save(urljoin(url, ref), ref)
    save(url, "index.html")
if __name__ == '__main__':
    # Parse -u URL from the command line; `url` is read as a module-level
    # global by main(). (The original's `global url` statement at module
    # scope was a no-op and has been removed.)
    import argparse
    parser = argparse.ArgumentParser(description='Scrap web.')
    parser.add_argument('-u', help='put url in here')
    args = parser.parse_args()
    url = args.u
    if url:
        main()
    else:
        # print() call form works under both Python 2 and 3, unlike the
        # original py2-only print statement.
        print('Please input url')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment