Skip to content

Instantly share code, notes, and snippets.

@cwilhit
Last active June 2, 2021 17:46
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save cwilhit/baa07732e55b79b6f118d143ea9b9646 to your computer and use it in GitHub Desktop.
Save cwilhit/baa07732e55b79b6f118d143ea9b9646 to your computer and use it in GitHub Desktop.
Scrapes Docker Hub, collecting the pull count of every image, the time its "latest" tag was last updated, and the base image named in the "FROM" instruction of each image's Dockerfile.
#########################################################################################
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
#########################################################################################
Microsoft.PowerShell.Core\Set-StrictMode -Version Latest

#region variables
# Docker store endpoints: the product search API and the per-repository API root.
$urlDockerfile = 'https://store.docker.com/api/content/v1/products/search'
$urlTags       = 'https://store.docker.com/v2/repositories/'

# Accumulates one record per scraped image for the whole run.
$dockerFiles = @()

# Full path of the current location at start-up; the CSV report is written there.
$dockerFileDir = Get-Item -Path '.\' -Verbose | Select-Object -ExpandProperty FullName
#endregion variables
Write-Output "Scraping Dockerhub via Docker store..."

# First gather all the images from the certified ("verified") area of the Docker store.
[string]$QueryString = 'page_size=100&q=&source=verified&type=image%2Cbundle'
$UriBuilder = New-Object System.UriBuilder -ArgumentList $urlDockerfile
$UriBuilder.Query = $QueryString
$BaseUri = $UriBuilder.Uri

# Probe request: only its 'count' field is used, to work out how many pages to walk.
# -ErrorAction Stop aborts right here on a failed request instead of letting the
# script carry on with an unset $HttpContent.
$HttpContent = Invoke-WebRequest -Uri $BaseUri -ErrorAction Stop
$PageResponseContent = $HttpContent.Content | ConvertFrom-Json

# Total number of images reported by the search for the verified area.
$numDockerfiles = $PageResponseContent.count

# Each request to the search API returns at most 100 items.
$itemsPerPage = 100

# Round UP: a plain [int] cast rounds to nearest, which silently drops the last
# partial page (e.g. 240 items -> 2 pages) and yields 0 pages for fewer than 50 items.
$iterations = [int][math]::Ceiling([double]$numDockerfiles / $itemsPerPage)
# Walk every result page of the verified ("store") search and append one record to
# $dockerFiles for each image whose backing repository is not in the "store" namespace.
# Any failed web request aborts the script (-ErrorAction Stop).
For ($PageNumber = 1; $PageNumber -le $iterations; $PageNumber++) {
    # Get the page results from the search API.
    $PageUri = $BaseUri.AbsoluteUri + "&page=$($PageNumber.ToString())"
    $PageResponse = Invoke-WebRequest -Uri $PageUri -ErrorAction Stop
    $items = ($PageResponse.Content | ConvertFrom-Json).summaries

    Foreach ($_item in $items) {
        # Fetch the product details for this search hit.
        $productUri = "https://store.docker.com/api/content/v1/products/images/" + $_item.slug
        $product = (Invoke-WebRequest -Uri $productUri -ErrorAction Stop).Content | ConvertFrom-Json

        # Resolve the first repository of the first plan. Under Set-StrictMode a
        # missing property or empty array would raise a non-fatal error and leave
        # $repo holding the PREVIOUS image's repository, so look the members up
        # explicitly and skip the product when they are absent.
        $repo = $null
        $plansProperty = $product.PSObject.Properties['plans']
        if ($null -ne $plansProperty -and $plansProperty.Value) {
            $firstPlan = @($plansProperty.Value)[0]
            if ($null -ne $firstPlan) {
                $reposProperty = $firstPlan.PSObject.Properties['repositories']
                if ($null -ne $reposProperty -and $reposProperty.Value) {
                    $repo = @($reposProperty.Value)[0]
                }
            }
        }
        if ($null -eq $repo) {
            Write-Warning "Skipping '$($_item.slug)': no plan/repository information returned."
            continue
        }

        # Only keep container images in the store which also reside in dockerhub.
        if ($repo.namespace -ne "store") {
            $repoName = $repo.namespace + "/" + $repo.reponame
            $tagsUri = "https://hub.docker.com/v2/repositories/" + $repoName + "/tags/?page=1&page_size=250"
            $tags = ((Invoke-WebRequest -Uri $tagsUri -ErrorAction Stop).Content | ConvertFrom-Json).results

            # Find when the 'latest' tag was last updated; stays empty if the tag
            # is not among the tags returned by this single request.
            $latestDate = ""
            Foreach ($_tag in $tags) {
                if ($_tag.name -eq "latest") {
                    $latestDate = $_tag.last_updated
                    break
                }
            }

            # The base image is not looked up for store images; the column is left empty.
            $fromBaseImage = ""

            # Save the result.
            $dockerFiles += [pscustomobject]@{
                Name      = $repoName
                PullCount = $_item.popularity
                CreatedAt = $_item.created_at
                UpdatedAt = $latestDate
                FromImage = $fromBaseImage
            }
        }
    }
}
Write-Output "Finished processing 'store' images"

# Write the records gathered so far to out.csv in the directory held in $dockerFileDir.
# Join-Path inserts the separator itself, so there is no hard-coded "\" and no doubled
# separator when the directory already ends in one (e.g. a drive root).
$fileDestination = Join-Path -Path $dockerFileDir -ChildPath 'out.csv'
$dockerFiles | Export-Csv -Path $fileDestination -NoTypeInformation
# Now gather all of the images from "community".
# Build the query string used to find out how many pages there are to walk through.
[string]$QueryString = 'page_size=100&q=%2B&source=community&type=image%2Cbundle'
$UriBuilder = New-Object System.UriBuilder -ArgumentList $urlDockerfile
$UriBuilder.Query = $QueryString
$BaseUri = $UriBuilder.Uri

# Probe request: only its 'count' field is used. -ErrorAction Stop aborts here on a
# failed request instead of silently reusing the response of an earlier request.
$HttpContent = Invoke-WebRequest -Uri $BaseUri -ErrorAction Stop
$PageResponseContent = $HttpContent.Content | ConvertFrom-Json

# Total number of images reported by the search for the community source.
$numDockerfiles = $PageResponseContent.count

# Each request to the search API returns at most 100 items.
$itemsPerPage = 100

# Round UP: a plain [int] cast rounds to nearest, which silently drops the last
# partial page and yields 0 pages when fewer than 50 items are reported.
$iterations = [int][math]::Ceiling([double]$numDockerfiles / $itemsPerPage)
# Fetches $Uri and returns the parsed JSON body. When the request or the JSON
# parsing fails, a warning is written and nothing is returned, so the caller
# receives $null rather than data left over from an earlier request.
function Get-JsonOrNull {
    param([uri]$Uri)

    try {
        $response = Invoke-WebRequest -Uri $Uri -ErrorAction Stop
        return ($response.Content | ConvertFrom-Json)
    }
    catch {
        Write-Warning "Request to '$Uri' failed: $($_.Exception.Message)"
    }
}

# Walk every result page of the community search. For each image record its
# popularity, creation date, the last-updated time of its 'latest' tag and the
# base image named by the first FROM instruction of its Dockerfile.
For ($PageNumber = 1; $PageNumber -le $iterations; $PageNumber++) {
    Write-Output "Processing page $PageNumber"

    # Get the page results from the search API; a failure here aborts the script.
    $PageUri = $BaseUri.AbsoluteUri + "&page=$($PageNumber.ToString())"
    $PageResponse = Invoke-WebRequest -Uri $PageUri -ErrorAction Stop
    $items = ($PageResponse.Content | ConvertFrom-Json).summaries

    Foreach ($item in $items) {
        # Look up when the 'latest' tag was last updated. Only the first 100 tags
        # are requested; the value stays empty when the request fails or the tag
        # is not among them.
        $latestDate = ""
        $tagsPage = Get-JsonOrNull -Uri ($urlTags + $item.name + "/tags/?page_size=100&page=1")
        if ($null -ne $tagsPage -and $null -ne $tagsPage.PSObject.Properties['results']) {
            Foreach ($tag in $tagsPage.results) {
                if ($null -ne $tag -and $tag.name -eq "latest") {
                    $latestDate = $tag.last_updated
                    break
                }
            }
        }

        # Get the base image that this image's Dockerfile uses. The value stays
        # empty when no Dockerfile is returned for the repository.
        $fromBaseImage = ""
        $dockerfile = Get-JsonOrNull -Uri ($urlTags + $item.name + "/dockerfile/")
        if ($null -ne $dockerfile) {
            # Property lookup via PSObject so a response without 'contents' does
            # not raise a strict-mode error.
            $contentsProperty = $dockerfile.PSObject.Properties['contents']
            if ($null -ne $contentsProperty -and $contentsProperty.Value) {
                Foreach ($line in ([string]$contentsProperty.Value -split '\r?\n')) {
                    # Dockerfile instructions are case-insensitive and may be
                    # indented; -match is case-insensitive. Capture everything
                    # after FROM with surrounding whitespace removed.
                    if ($line -match '^\s*FROM\s+(.*\S)\s*$') {
                        $fromBaseImage = $Matches[1]
                        break
                    }
                }
            }
        }

        # Record the values we want for this image.
        $dockerFiles += [pscustomobject]@{
            Name      = $item.name
            PullCount = $item.popularity
            CreatedAt = $item.created_at
            UpdatedAt = $latestDate
            FromImage = $fromBaseImage
        }
    }

    # Rewrite the CSV after every page so an interrupted run still leaves the
    # records gathered so far on disk.
    $fileDestination = Join-Path -Path $dockerFileDir -ChildPath 'out.csv'
    $dockerFiles | Export-Csv -Path $fileDestination -NoTypeInformation
}
Write-Output "Scraping Complete!"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment