Last active
April 9, 2019 16:24
-
-
Save Mah1ndra/50a03e45695fdb472c642622e2c50842 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""
Download all Debian patch (.deb) files linked from a page given on the command line.

Usage:
    ./crawler.py https://packages.debian.org/jessie/armel/patch/download

The crawler downloads every .deb file linked from the given page and stores
it under OUTPATH.

References:
https://stackoverflow.com/questions/12996274/get-file-size-from-content-length-value-from-a-file-in-python-3-2
https://stackoverflow.com/questions/29827479/beautifulsoup-download-all-zip-files-from-google-patent-search
"""
import sys
from pathlib import Path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Directory where the downloaded patch files are stored.
OUTPATH = Path('/home/mahendra/projects/temp/patch/')
MBYTE = 1024 * 1024


def main() -> None:
    """Crawl the page named in argv[1] and download every linked .deb file."""
    if len(sys.argv) < 2:
        # Original code raised IndexError here; give a usage message instead.
        print('Usage: %s <url>' % sys.argv[0])
        sys.exit(1)
    url = sys.argv[1]

    print('Reading: %s' % (url))
    html = requests.get(url).text
    soup = BeautifulSoup(html, "html.parser")
    print('Processing: %s' % (url))

    for patch in soup.find_all('a', href=True):
        # hrefs on the Debian page may be relative; resolve against the page URL.
        patchurl = urljoin(url, patch['href'])
        if not patchurl.endswith('.deb'):
            continue
        outfname = OUTPATH / patchurl.rsplit('/', 1)[-1]
        req = requests.get(patchurl, stream=True)
        if req.status_code == requests.codes.ok:
            # Use the Content-Length header for the size report; reading
            # req.content would pull the whole body into memory and defeat
            # stream=True. Header may be absent, so default to 0.
            fsize = int(req.headers.get('content-length', 0))
            print('Downloading %s (%sMb)' % (outfname, fsize / MBYTE))
            # The context manager closes the file; no explicit close needed.
            with open(outfname, 'wb') as fd:
                for chunk in req.iter_content(chunk_size=1024):
                    if chunk:  # skip keep-alive chunks
                        fd.write(chunk)


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("[Ctrl+C] pressed quitting Crawler")
        sys.exit(0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment