brucemcpherson/scraping.vba

## scraping.vba
Option Explicit
' see http://ramblings.mcpher.com/Home/excelquirks/regular-expressions/regexscraping
' v1.0
Public Sub halfMarathonScrape()

    Dim cb As cBrowser, i As Long, headerRow As String
    Dim URL As String, r As Range, heads As MatchCollection, _
        t As Long, matches As MatchCollection, match As match
    Dim nodeList As IHTMLElementCollection, node As IHTMLElement, _
        rx As RegExp, lineLength As Long, p As Long, item As match, _
        dats As MatchCollection, data As match, k As Long

    URL = "http://www.coolrunning.com/results/13/ma/Oct13_13thAn_set1.shtml"

    ' get the web page
    Set cb = New cBrowser
    cb.init().Navigate URL, , False

    ' its the only <pre> in the page
    Set nodeList = cb.elementTags("pre")

    Set node = nodeList(0)
    ' node should contain the results
    ' kind of wierd its not an html table
    Debug.Assert InStr(1, node.innerText, "Half Marathon") > 0

    ' so now we have to do a textparsing excercise, anchor on the "===" titles
    ' a simple regex will find the title distribution
    Set rx = New RegExp
    With rx
        .ignorecase = True
        .Global = True
        .Pattern = "=.*(?=\s)"
    End With
    Set matches = rx.execute(node.innerText)
    ' now we should have a single match with all about where the title underlining is
    ' looks like this ===== ======== ==== =====
    ' except that all but the first one are misaligned by 1 for the data
    Debug.Assert matches.count = 1
    Set match = matches(0)

    ' we'll split that up into seperate sections
    rx.Pattern = "=+(?=\s)"
    Set heads = rx.execute(Mid(node.innerText, match.FirstIndex + 1, match.Length - 1))

    ' we can get the header line
    p = 0
    For i = match.FirstIndex To 1 Step -1
        If (Mid(node.innerText, i, 1) = vbLf) Then
            p = p + 1
            If (p = 2) Then
                ' we're at the \n before the header row
                headerRow = Mid(node.innerText, i + 1, match.FirstIndex - i - 2)
                Exit For
            End If
        End If
    Next i


     'clear the worksheet
    Application.Calculation = xlCalculationManual
    Set r = firstCell(wholeSheet("marathonresults"))
    r.Worksheet.Cells.ClearContents

    p = 0
    For Each item In heads
        r.Offset(0, p).value = Trim(Mid(headerRow, item.FirstIndex + 1, item.Length))
        p = p + 1
    Next item

    ' and the data
    rx.Pattern = ".+(?=\n|$)"
    ' get the start point of the data
    k = match.FirstIndex + match.Length
    While (Mid(node.innerText, k, 1) <> vbLf)
        k = k + 1
    Wend
    Set dats = rx.execute(Mid(node.innerText, 1 + k))

    t = 0
    For Each data In dats
        p = 0
        t = t + 1
        For Each item In heads
            k = item.FirstIndex + 1
            ' the headings, except the first, are actually misaligned by 1
            If k > 1 Then k = k - 1
            r.Offset(t, p).value = Trim(Mid(data.value, k, item.Length))
            p = p + 1
        Next item
    Next data
    ' cleanup
    cb.tearDown
    Set cb = Nothing
    Set dats = Nothing
    Set heads = Nothing
    Set matches = Nothing
    Application.Calculation = xlCalculationAutomatic
End Sub

Public Sub vizMaratahonStatic()
    ' a static html rendering
    tableToHtml "marathonresults", , , , "static"

End Sub
Public Sub vizMaratahonGoogle()

    tableToHtml "marathonresults", , , , "google"
End Sub
	Option Explicit
	' see http://ramblings.mcpher.com/Home/excelquirks/regular-expressions/regexscraping
	' v1.0
	Public Sub halfMarathonScrape()

	Dim cb As cBrowser, i As Long, headerRow As String
	Dim URL As String, r As Range, heads As MatchCollection, _
	t As Long, matches As MatchCollection, match As match
	Dim nodeList As IHTMLElementCollection, node As IHTMLElement, _
	rx As RegExp, lineLength As Long, p As Long, item As match, _
	dats As MatchCollection, data As match, k As Long

	URL = "http://www.coolrunning.com/results/13/ma/Oct13_13thAn_set1.shtml"

	' get the web page
	Set cb = New cBrowser
	cb.init().Navigate URL, , False

	' its the only <pre> in the page
	Set nodeList = cb.elementTags("pre")

	Set node = nodeList(0)
	' node should contain the results
	' kind of wierd its not an html table
	Debug.Assert InStr(1, node.innerText, "Half Marathon") > 0

	' so now we have to do a textparsing excercise, anchor on the "===" titles
	' a simple regex will find the title distribution
	Set rx = New RegExp
	With rx
	.ignorecase = True
	.Global = True
	.Pattern = "=.*(?=\s)"
	End With
	Set matches = rx.execute(node.innerText)
	' now we should have a single match with all about where the title underlining is
	' looks like this ===== ======== ==== =====
	' except that all but the first one are misaligned by 1 for the data
	Debug.Assert matches.count = 1
	Set match = matches(0)

	' we'll split that up into seperate sections
	rx.Pattern = "=+(?=\s)"
	Set heads = rx.execute(Mid(node.innerText, match.FirstIndex + 1, match.Length - 1))

	' we can get the header line
	p = 0
	For i = match.FirstIndex To 1 Step -1
	If (Mid(node.innerText, i, 1) = vbLf) Then
	p = p + 1
	If (p = 2) Then
	' we're at the \n before the header row
	headerRow = Mid(node.innerText, i + 1, match.FirstIndex - i - 2)
	Exit For
	End If
	End If
	Next i


	'clear the worksheet
	Application.Calculation = xlCalculationManual
	Set r = firstCell(wholeSheet("marathonresults"))
	r.Worksheet.Cells.ClearContents

	p = 0
	For Each item In heads
	r.Offset(0, p).value = Trim(Mid(headerRow, item.FirstIndex + 1, item.Length))
	p = p + 1
	Next item

	' and the data
	rx.Pattern = ".+(?=\n\|$)"
	' get the start point of the data
	k = match.FirstIndex + match.Length
	While (Mid(node.innerText, k, 1) <> vbLf)
	k = k + 1
	Wend
	Set dats = rx.execute(Mid(node.innerText, 1 + k))

	t = 0
	For Each data In dats
	p = 0
	t = t + 1
	For Each item In heads
	k = item.FirstIndex + 1
	' the headings, except the first, are actually misaligned by 1
	If k > 1 Then k = k - 1
	r.Offset(t, p).value = Trim(Mid(data.value, k, item.Length))
	p = p + 1
	Next item
	Next data
	' cleanup
	cb.tearDown
	Set cb = Nothing
	Set dats = Nothing
	Set heads = Nothing
	Set matches = Nothing
	Application.Calculation = xlCalculationAutomatic
	End Sub

	Public Sub vizMaratahonStatic()
	' a static html rendering
	tableToHtml "marathonresults", , , , "static"

	End Sub
	Public Sub vizMaratahonGoogle()

	tableToHtml "marathonresults", , , , "google"
	End Sub