Skip to content

Instantly share code, notes, and snippets.

@Tucker-Eric
Created October 5, 2016 18:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Tucker-Eric/30736bbdf8c8496fb43a746c4adbb8ad to your computer and use it in GitHub Desktop.
Save Tucker-Eric/30736bbdf8c8496fb43a746c4adbb8ad to your computer and use it in GitHub Desktop.
HTML Image Scraper
'*****************************************************************
Const adSaveCreateOverWrite = 2
Const adTypeBinary = 1
Dim htmlSrc, dstDir
htmlSrc = InputBox("Enter Html File Name") & ".html"
dstDir = InputBox("Enter Folder Name To Place Images")
'*****************************************************************
'** Download the image
' strResult = GetImage(strSource, strDest)
getImgTagUrl htmlSrc, dstDir
' If strResult = "OK" Then
' wscript.quit(0)
' Else
' wscript.quit(1)
' End If
Function GetImage(strPath, dstDir)
Dim objXMLHTTP, nF, arr, objFSO, objFile, re
Dim objRec, objStream
Dim strDest
Set re = new RegExp
re.pattern = "^.*\/([\w-]+\.\w+)$"
strDest = dstDir & "\" & re.Replace(strPath, "$1")
'create XMLHTTP component
Set objXMLHTTP = CreateObject("Microsoft.XMLHTTP")
'get the image specified by strPath
objXMLHTTP.Open "GET", strPath, False
objXMLHTTP.Send
'check if retrieval was successful
If objXMLHTTP.statusText = "OK" Then
'create binary stream to write image output
Set objStream = CreateObject("ADODB.Stream")
objStream.Type = adTypeBinary
objStream.Open
objStream.Write objXMLHTTP.ResponseBody
objStream.SavetoFile strDest, adSaveCreateOverwrite
objStream.Close
GetImage = "OK"
Else
GetImage = objXMLHTTP.statusText
End If
End Function
Function getImgTagURL(htmlFile, dstDir)
Dim HTMLstring
Set objFS = CreateObject("Scripting.FileSystemObject")
Set file = objFS.OpenTextFile(htmlFile, 1)
If NOT objFS.FolderExists(dstDir) Then
objFS.CreateFolder(dstDir)
End If
HTMLstring = file.ReadAll
Set RegEx = New RegExp
With RegEx
.Pattern = "src=[\""\']([^\""\']+)"
.IgnoreCase = True
.Global = True
End With
Set Matches = RegEx.Execute(HTMLstring)
Set isJs = New RegExp
isJs.Pattern = "\.js$"
Set isHttp = New RegExp
isHttp.Pattern = "^src=[\""\'](https?\:)?\/\/"
'Iterate through the Matches collection.
URL = ""
For Each Match in Matches
'We only want the first match.
URL = Match.Value
If (isJs.Test(URL) = False) AND (isHttp.Test(URL) = True) Then
GetImage Replace(URL, "src=""", ""), dstDir
End If
' Exit For
Next
'Clean up
Set Match = Nothing
Set RegEx = Nothing
Set isJs = Nothing
Set isHttp = Nothing
' src=" is hanging on the front, so we will replace it with nothing
End Function
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment