Created
October 17, 2013 09:57
-
-
Save brucemcpherson/7022247 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Option Explicit | |
' see http://ramblings.mcpher.com/Home/excelquirks/regular-expressions/regexscraping | |
' v1.0 | |
Public Sub halfMarathonScrape() | |
Dim cb As cBrowser, i As Long, headerRow As String | |
Dim URL As String, r As Range, heads As MatchCollection, _ | |
t As Long, matches As MatchCollection, match As match | |
Dim nodeList As IHTMLElementCollection, node As IHTMLElement, _ | |
rx As RegExp, lineLength As Long, p As Long, item As match, _ | |
dats As MatchCollection, data As match, k As Long | |
URL = "http://www.coolrunning.com/results/13/ma/Oct13_13thAn_set1.shtml" | |
' get the web page | |
Set cb = New cBrowser | |
cb.init().Navigate URL, , False | |
' its the only <pre> in the page | |
Set nodeList = cb.elementTags("pre") | |
Set node = nodeList(0) | |
' node should contain the results | |
' kind of wierd its not an html table | |
Debug.Assert InStr(1, node.innerText, "Half Marathon") > 0 | |
' so now we have to do a textparsing excercise, anchor on the "===" titles | |
' a simple regex will find the title distribution | |
Set rx = New RegExp | |
With rx | |
.ignorecase = True | |
.Global = True | |
.Pattern = "=.*(?=\s)" | |
End With | |
Set matches = rx.execute(node.innerText) | |
' now we should have a single match with all about where the title underlining is | |
' looks like this ===== ======== ==== ===== | |
' except that all but the first one are misaligned by 1 for the data | |
Debug.Assert matches.count = 1 | |
Set match = matches(0) | |
' we'll split that up into seperate sections | |
rx.Pattern = "=+(?=\s)" | |
Set heads = rx.execute(Mid(node.innerText, match.FirstIndex + 1, match.Length - 1)) | |
' we can get the header line | |
p = 0 | |
For i = match.FirstIndex To 1 Step -1 | |
If (Mid(node.innerText, i, 1) = vbLf) Then | |
p = p + 1 | |
If (p = 2) Then | |
' we're at the \n before the header row | |
headerRow = Mid(node.innerText, i + 1, match.FirstIndex - i - 2) | |
Exit For | |
End If | |
End If | |
Next i | |
'clear the worksheet | |
Application.Calculation = xlCalculationManual | |
Set r = firstCell(wholeSheet("marathonresults")) | |
r.Worksheet.Cells.ClearContents | |
p = 0 | |
For Each item In heads | |
r.Offset(0, p).value = Trim(Mid(headerRow, item.FirstIndex + 1, item.Length)) | |
p = p + 1 | |
Next item | |
' and the data | |
rx.Pattern = ".+(?=\n|$)" | |
' get the start point of the data | |
k = match.FirstIndex + match.Length | |
While (Mid(node.innerText, k, 1) <> vbLf) | |
k = k + 1 | |
Wend | |
Set dats = rx.execute(Mid(node.innerText, 1 + k)) | |
t = 0 | |
For Each data In dats | |
p = 0 | |
t = t + 1 | |
For Each item In heads | |
k = item.FirstIndex + 1 | |
' the headings, except the first, are actually misaligned by 1 | |
If k > 1 Then k = k - 1 | |
r.Offset(t, p).value = Trim(Mid(data.value, k, item.Length)) | |
p = p + 1 | |
Next item | |
Next data | |
' cleanup | |
cb.tearDown | |
Set cb = Nothing | |
Set dats = Nothing | |
Set heads = Nothing | |
Set matches = Nothing | |
Application.Calculation = xlCalculationAutomatic | |
End Sub | |
Public Sub vizMaratahonStatic() | |
' a static html rendering | |
tableToHtml "marathonresults", , , , "static" | |
End Sub | |
Public Sub vizMaratahonGoogle() | |
tableToHtml "marathonresults", , , , "google" | |
End Sub | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment