Skip to content

Instantly share code, notes, and snippets.

@brucemcpherson
Created October 17, 2013 09:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save brucemcpherson/7022247 to your computer and use it in GitHub Desktop.
Save brucemcpherson/7022247 to your computer and use it in GitHub Desktop.
Option Explicit
' see http://ramblings.mcpher.com/Home/excelquirks/regular-expressions/regexscraping
' v1.0
Public Sub halfMarathonScrape()
Dim cb As cBrowser, i As Long, headerRow As String
Dim URL As String, r As Range, heads As MatchCollection, _
t As Long, matches As MatchCollection, match As match
Dim nodeList As IHTMLElementCollection, node As IHTMLElement, _
rx As RegExp, lineLength As Long, p As Long, item As match, _
dats As MatchCollection, data As match, k As Long
URL = "http://www.coolrunning.com/results/13/ma/Oct13_13thAn_set1.shtml"
' get the web page
Set cb = New cBrowser
cb.init().Navigate URL, , False
' its the only <pre> in the page
Set nodeList = cb.elementTags("pre")
Set node = nodeList(0)
' node should contain the results
' kind of wierd its not an html table
Debug.Assert InStr(1, node.innerText, "Half Marathon") > 0
' so now we have to do a textparsing excercise, anchor on the "===" titles
' a simple regex will find the title distribution
Set rx = New RegExp
With rx
.ignorecase = True
.Global = True
.Pattern = "=.*(?=\s)"
End With
Set matches = rx.execute(node.innerText)
' now we should have a single match with all about where the title underlining is
' looks like this ===== ======== ==== =====
' except that all but the first one are misaligned by 1 for the data
Debug.Assert matches.count = 1
Set match = matches(0)
' we'll split that up into seperate sections
rx.Pattern = "=+(?=\s)"
Set heads = rx.execute(Mid(node.innerText, match.FirstIndex + 1, match.Length - 1))
' we can get the header line
p = 0
For i = match.FirstIndex To 1 Step -1
If (Mid(node.innerText, i, 1) = vbLf) Then
p = p + 1
If (p = 2) Then
' we're at the \n before the header row
headerRow = Mid(node.innerText, i + 1, match.FirstIndex - i - 2)
Exit For
End If
End If
Next i
'clear the worksheet
Application.Calculation = xlCalculationManual
Set r = firstCell(wholeSheet("marathonresults"))
r.Worksheet.Cells.ClearContents
p = 0
For Each item In heads
r.Offset(0, p).value = Trim(Mid(headerRow, item.FirstIndex + 1, item.Length))
p = p + 1
Next item
' and the data
rx.Pattern = ".+(?=\n|$)"
' get the start point of the data
k = match.FirstIndex + match.Length
While (Mid(node.innerText, k, 1) <> vbLf)
k = k + 1
Wend
Set dats = rx.execute(Mid(node.innerText, 1 + k))
t = 0
For Each data In dats
p = 0
t = t + 1
For Each item In heads
k = item.FirstIndex + 1
' the headings, except the first, are actually misaligned by 1
If k > 1 Then k = k - 1
r.Offset(t, p).value = Trim(Mid(data.value, k, item.Length))
p = p + 1
Next item
Next data
' cleanup
cb.tearDown
Set cb = Nothing
Set dats = Nothing
Set heads = Nothing
Set matches = Nothing
Application.Calculation = xlCalculationAutomatic
End Sub
Public Sub vizMaratahonStatic()
' a static html rendering
tableToHtml "marathonresults", , , , "static"
End Sub
Public Sub vizMaratahonGoogle()
tableToHtml "marathonresults", , , , "google"
End Sub
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment