# Scrapes Docker Hub, collecting the pull count of every image, the time its "latest" tag was last updated, and the associated "FROM" base image of each container.
# Copyright (c) Microsoft Corporation. All rights reserved.
# Turn on the strictest script checks for everything that follows.
Microsoft.PowerShell.Core\Set-StrictMode -Version Latest
#region variables
# Collects one record per scraped image; the scraping sections append to it.
$dockerFiles = @()
# Base addresses consumed by the scraping sections.
# NOTE(review): both are empty strings in this copy and are handed to
# System.UriBuilder later on - presumably the real endpoints must be filled in
# before the script can run; confirm.
$urlDockerfile = ""
$urlTags = ""
# Full path of the current location; used as the directory for the CSV output.
$dockerFileDir = Get-Item -Path ".\" -Verbose | Select-Object -ExpandProperty FullName
#endregion variables
Write-Output "Scraping Dockerhub via Docker store..."

# ---------------------------------------------------------------------------
# Section 1: images listed in the certified ("verified") Docker store area.
# For every listed product the detail document is fetched, its first
# repository is looked up, and - unless that repository lives in the "store"
# namespace - the date of its 'latest' tag is recorded.
# ---------------------------------------------------------------------------

# Initial request without a page number; only its total hit count is used.
[string]$QueryString = 'page_size=100&q=&source=verified&type=image%2Cbundle'
$UriBuilder = New-Object System.UriBuilder -ArgumentList $urlDockerfile
$UriBuilder.Query = $QueryString
$BaseUri = $UriBuilder.Uri
$HttpContent = Invoke-WebRequest -Uri $BaseUri
$PageResponseContent = $HttpContent.Content | ConvertFrom-Json

# Total number of entries reported for the store area.
$numDockerfiles = $PageResponseContent.count
# Each listing request returns up to 100 entries (page_size in the query above).
$itemsPerPage = 100
# Round UP so that a trailing, partially filled page is still visited.
# A plain [int] cast rounds to nearest-even and would drop that page
# (or yield zero iterations for fewer than 50 hits).
$iterations = [int][math]::Ceiling([double]$numDockerfiles / $itemsPerPage)

For ($PageNumber = 1; $PageNumber -le $iterations; $PageNumber++) {
    # Fetch one page of the listing.
    $PageUri = $BaseUri.AbsoluteUri + "&page=$PageNumber"
    $PageResponse = Invoke-WebRequest -Uri $PageUri -ErrorAction Stop
    $items = ($PageResponse.Content | ConvertFrom-Json).summaries

    Foreach ($_item in $items) {
        # Fetch the detail document of this listing entry.
        # NOTE(review): the URL prefix is an empty string in this copy;
        # presumably the product-detail endpoint belongs in front of the slug - confirm.
        $PageContentUri = "" + $_item.slug
        $detailResponse = Invoke-WebRequest -Uri $PageContentUri -ErrorAction Stop
        $detail = $detailResponse.Content | ConvertFrom-Json

        # Only store entries whose image also resides on Docker Hub are
        # recorded; repositories in the "store" namespace are skipped.
        $repo = $detail.plans[0].repositories[0]
        if ($repo.namespace -ne "store") {
            $repoName = $repo.namespace + "/" + $repo.reponame

            # NOTE(review): URL prefix is empty here as well - confirm the
            # repository endpoint that should precede the repository name.
            $PageContentUri = "" + $repoName + "/tags/?page=1&page_size=250"
            $tagResponse = Invoke-WebRequest -Uri $PageContentUri -ErrorAction Stop
            $tags = ($tagResponse.Content | ConvertFrom-Json).results

            # Date on which the 'latest' tag was last updated (empty if no such tag).
            $latestDate = ""
            Foreach ($_tag in $tags) {
                if ($_tag.name -eq "latest") {
                    $latestDate = $_tag.last_updated
                    break
                }
            }

            # The base image is not resolved in this section, so the column stays empty.
            $fromBaseImage = ""

            # Save the result.
            $dockerFiles += [PSCustomObject]@{
                Name      = $repoName
                PullCount = $_item.popularity
                CreatedAt = $_item.created_at
                UpdatedAt = $latestDate
                FromImage = $fromBaseImage
            }
        }
    }
}

Write-Output "Finished processing 'store' images"

# Write the records gathered so far to out.csv in the working directory.
$fileDestination = $dockerFileDir + "\out.csv"
$dockerFiles | Export-Csv $fileDestination -NoTypeInformation
# ---------------------------------------------------------------------------
# Section 2: images from the "community" source.
# For every listed image the date of its 'latest' tag and the base image named
# by the FROM instruction of its Dockerfile are recorded.
# ---------------------------------------------------------------------------

# Initial request without a page number; only its total hit count is used.
[string]$QueryString = 'page_size=100&q=%2B&source=community&type=image%2Cbundle'
$UriBuilder = New-Object System.UriBuilder -ArgumentList $urlDockerfile
$UriBuilder.Query = $QueryString
$BaseUri = $UriBuilder.Uri
$HttpContent = Invoke-WebRequest -Uri $BaseUri
$PageResponseContent = $HttpContent.Content | ConvertFrom-Json

# Total number of entries reported for the community source.
$numDockerfiles = $PageResponseContent.count
# Each listing request returns up to 100 entries (page_size in the query above).
$itemsPerPage = 100
# Round UP so that a trailing, partially filled page is still visited.
# A plain [int] cast rounds to nearest-even and would drop that page.
$iterations = [int][math]::Ceiling([double]$numDockerfiles / $itemsPerPage)

For ($PageNumber = 1; $PageNumber -le $iterations; $PageNumber++) {
    Write-Output "Processing page $PageNumber"

    # Fetch one page of the listing.
    $PageUri = $BaseUri.AbsoluteUri + "&page=$PageNumber"
    $PageResponse = Invoke-WebRequest -Uri $PageUri -ErrorAction Stop
    $items = ($PageResponse.Content | ConvertFrom-Json).summaries

    Foreach ($_item in $items) {
        # NOTE(review): the image identifier was an unreadable bare '$' in this
        # copy; it is taken to be the listing entry's 'name' property - confirm.
        $imageName = $_item.name

        # --- 'latest' tag date ------------------------------------------------
        # Reset per image so a failed request can never leave the previous
        # image's value in place.
        $latestDate = ""
        try {
            $UriBuilder = New-Object System.UriBuilder -ArgumentList ($urlTags + $imageName + "/tags/")
            $UriBuilder.Query = 'page_size=100&page=1'
            $tagResponse = Invoke-WebRequest -Uri $UriBuilder.Uri -ErrorAction Stop
            $tagsArray = ($tagResponse.Content | ConvertFrom-Json).results
            Foreach ($_tag in $tagsArray) {
                if ($_tag.name -eq "latest") {
                    $latestDate = $_tag.last_updated
                    break
                }
            }
        }
        catch {
            Write-Warning "Could not read tags for '$imageName': $($_.Exception.Message)"
        }

        # --- base image from the Dockerfile -------------------------------------
        $fromBaseImage = ""
        try {
            $UriBuilder = New-Object System.UriBuilder -ArgumentList ($urlTags + $imageName + "/dockerfile/")
            $dockerfileResponse = Invoke-WebRequest -Uri $UriBuilder.Uri -ErrorAction Stop
            $dockerfileJson = $dockerfileResponse.Content | ConvertFrom-Json
            Foreach ($_line in ($dockerfileJson.contents -split '\r?\n')) {
                # Only FROM instructions name a base image. The match is
                # case-insensitive and captures the rest of the line without
                # surrounding whitespace. No break: when a Dockerfile has
                # several FROM lines, the last one is the value kept.
                if ($_line -match '^\s*FROM\s+(.+?)\s*$') {
                    $fromBaseImage = $Matches[1]
                }
            }
        }
        catch {
            Write-Warning "Could not read Dockerfile for '$imageName': $($_.Exception.Message)"
        }

        # Record the values gathered for this image.
        $dockerFiles += [PSCustomObject]@{
            Name      = $imageName
            PullCount = $_item.popularity
            CreatedAt = $_item.created_at
            UpdatedAt = $latestDate
            FromImage = $fromBaseImage
        }
    }
}

# Write the complete set of records to out.csv in the working directory.
$fileDestination = $dockerFileDir + "\out.csv"
$dockerFiles | Export-Csv $fileDestination -NoTypeInformation
Write-Output "Scraping Complete!"
