Skip to content

Instantly share code, notes, and snippets.

@bityob
Created July 19, 2017 05:19
Show Gist options
  • Save bityob/a1aa8b5a07de4e8d4143c9749ac5dd5b to your computer and use it in GitHub Desktop.
Save bityob/a1aa8b5a07de4e8d4143c9749ac5dd5b to your computer and use it in GitHub Desktop.
Download NuGet packages together with all of their dependencies, recursively
# -*- coding: utf-8 -*-
import scrapy
import os
# Package-page URLs scheduled so far: the seed pages first, followed by every
# dependency link the spider decided to follow.  Kept at module level under
# the name ``urls`` so existing references to it keep working.
urls = [
    'http://nuget.org/packages/Microsoft.Extensions.Caching.Memory/',
    'https://www.nuget.org/packages/Microsoft.EntityFrameworkCore.Sqlite/',
    'https://www.nuget.org/packages/Moq/',
]


class NugetSpider(scrapy.Spider):
    """Crawl nuget.org package pages and save each package's raw nupkg file.

    For every package page the spider follows the "Download the raw nupkg
    file." link (saving the response body under ``pkgs_folder``) and then
    follows the links listed in the page's ``#dependencySets`` section, so
    dependencies are visited recursively.  At most ``max_urls`` page URLs
    are recorded in the module-level ``urls`` list; once that cap is hit no
    further dependency pages are scheduled.
    """

    name = 'nuget'
    allowed_domains = ['nuget.org']
    # A *copy* of the seeds.  ``urls`` grows while crawling; sharing one list
    # object with ``start_urls`` would append every discovered link to the
    # list Scrapy reads its initial requests from.
    start_urls = list(urls)
    # Upper bound on the number of entries recorded in ``urls``.
    max_urls = 300
    # Directory (relative to the working directory) receiving the downloads.
    pkgs_folder = "packages"

    @staticmethod
    def _last_segment(url):
        """Return the last non-empty ``/``-separated segment of *url*.

        Trailing slashes are ignored, so ``.../packages/Moq/`` gives ``Moq``
        rather than an empty string.
        """
        return url.rstrip('/').split('/')[-1]

    def parse(self, response):
        """Handle one package page.

        Yields a request for the package's download link (unless a file with
        the link's last path segment already exists in ``pkgs_folder``),
        then one request per not-yet-recorded dependency link, until
        ``max_urls`` URLs have been recorded.

        Args:
            response: the Scrapy response for a package page.

        Yields:
            Requests built with ``response.follow``: the download request is
            handled by ``parse_file``, dependency pages by ``parse``.
        """
        self.log(f"Parsing... {len(urls)}")
        curr_page = self._last_segment(response.url)

        # set download package
        download_pkgs_url = response \
            .css('div#sideColumn ul li a[title="Download the raw nupkg file."]::attr(href)') \
            .extract_first()
        if download_pkgs_url is None:
            # extract_first() returns None when the selector matches nothing;
            # skip the download instead of crashing on ``None.split``.
            self.log(f"{curr_page} - no download link found")
        else:
            # NOTE(review): this name comes from the link's href, while
            # parse_file names the file after the final response URL; if the
            # download is redirected the two may differ -- confirm.
            download_fullpath = os.path.join(
                self.pkgs_folder, self._last_segment(download_pkgs_url))
            if not os.path.isfile(download_fullpath):
                yield response.follow(download_pkgs_url, callback=self.parse_file)

        # get dependencies url
        for href in response.css('#dependencySets ul li a::attr(href)').extract():
            # Compare absolute URLs so links can match the (absolute) seeds.
            furl = response.urljoin(href)
            self.log(f"{curr_page} - {furl} found")
            # validate didn't reach max urls
            if len(urls) >= self.max_urls:
                break
            if furl in urls:
                self.log(f"{curr_page} - {furl} already exists")
                continue
            urls.append(furl)
            self.log(f"{curr_page} - {furl} added")
            yield response.follow(furl, callback=self.parse)

    def parse_file(self, response):
        """Save a downloaded package body into ``pkgs_folder``.

        The file is named after the last path segment of ``response.url``.
        An existing file with that name is left untouched.

        Args:
            response: the Scrapy response whose ``body`` is the package.
        """
        curr_file = self._last_segment(response.url)
        self.log(f"Saving file... {curr_file}")
        fullpath = os.path.join(self.pkgs_folder, curr_file)
        if os.path.isfile(fullpath):
            self.log(f"{curr_file} Already saved")
            return
        # The target directory is not guaranteed to exist on a first run.
        os.makedirs(self.pkgs_folder, exist_ok=True)
        with open(fullpath, "wb") as f:
            f.write(response.body)
        self.log(f"{curr_file} File saved")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment