Created
February 9, 2020 19:41
-
-
Save schwastek/f05d8285b919a892819e3749335af844 to your computer and use it in GitHub Desktop.
Source code for: https://chwastek.eu/blog/web-scrapping-using-vba
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Option Explicit | |
Public Sub Main() | |
Dim htmlDocument As htmlDocument | |
Dim tableWithStories As HTMLTable | |
Dim rowsWithStories As IHTMLElementCollection | |
Dim storylinkAnchors As Collection | |
Dim output As Variant | |
Set htmlDocument = GetHtmlDocument("https://news.ycombinator.com/news") | |
Set tableWithStories = GetTableWithStories(htmlDocument) | |
Set rowsWithStories = GetRowsWithStories(tableWithStories) | |
Set storylinkAnchors = GetStorylinkAnchors(rowsWithStories) | |
Let output = BuildOutput(storylinkAnchors) | |
Call Display(output) | |
End Sub | |
Private Function GetHtmlDocument(url As String) As htmlDocument | |
Dim http As ServerXMLHTTP60 | |
Dim htmlDoc As htmlDocument | |
Set http = New ServerXMLHTTP60 | |
Set htmlDoc = New htmlDocument | |
With http | |
' Send an HTTP request | |
.Open bstrMethod:="GET", bstrUrl:=url, varAsync:=False | |
.send | |
' Check if the request has succeeded | |
If (.Status = 200) Then | |
' Inject raw HTML string into HTML document object | |
htmlDoc.body.innerHTML = .responseText | |
End If | |
End With | |
' Return | |
Set GetHtmlDocument = htmlDoc | |
End Function | |
Private Function GetTableWithStories(htmlDoc As htmlDocument) As HTMLTable | |
Dim allTables As IHTMLElementCollection | |
' Get all available HTML tables | |
Set allTables = htmlDoc.getElementsByTagName("table") | |
' Return the 3rd table containing Hacker News stories | |
Set GetTableWithStories = allTables.Item(2) | |
End Function | |
Private Function GetRowsWithStories(tableWithStories As HTMLTable) As IHTMLElementCollection | |
' <tr> rows with stories on Hacker News have class "athing" | |
Set GetRowsWithStories = tableWithStories.getElementsByClassName("athing") | |
End Function | |
Private Function GetStorylinkAnchors(tableRowsWithStories As IHTMLElementCollection) As Collection | |
Dim storyRow As HTMLTableRow | |
Dim storylink As HTMLAnchorElement | |
Dim storylinksCollection As Collection | |
' Create empty collection to hold <a> elements with title and link | |
Set storylinksCollection = New Collection | |
' Extract <a> elements with class "storylink" from <tr> rows | |
For Each storyRow In tableRowsWithStories | |
Set storylink = storyRow.getElementsByClassName("storylink").Item(0) | |
storylinksCollection.Add Item:=storylink | |
Next storyRow | |
' Return | |
Set GetStorylinkAnchors = storylinksCollection | |
End Function | |
Private Function BuildOutput(storylinks As Collection) As Variant | |
Dim output As Variant | |
Dim storylink As HTMLAnchorElement | |
Dim rowIndex As Long | |
ReDim output(1 To storylinks.Count, 1 To 2) | |
For Each storylink In storylinks | |
Let rowIndex = rowIndex + 1 | |
Let output(rowIndex, 1) = storylink.textContent | |
Let output(rowIndex, 2) = storylink.href | |
Next storylink | |
' Return | |
Let BuildOutput = output | |
End Function | |
Public Sub Display(output As Variant) | |
Dim targetWorksheet As Worksheet | |
Dim rowsQuantity As Long | |
Dim columnsQuantity As Long | |
Set targetWorksheet = ThisWorkbook.Worksheets(1) | |
Let rowsQuantity = UBound(output, 1) | |
Let columnsQuantity = 2 | |
With targetWorksheet | |
.UsedRange.Delete | |
.UsedRange.Clear | |
.Range(.Cells(1, 1), .Cells(rowsQuantity, columnsQuantity)) = output | |
End With | |
End Sub |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment