-
-
Save default-writer/43dcdf17d459904c993893e83f5b709a to your computer and use it in GitHub Desktop.
Parsing HTML Tables, in reply to https://www.reddit.com/r/PowerShell/comments/4gubc3/is_it_possible_to_web_scrape_the_text_in_the/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrape every HTML table on a page into PowerShell objects, one object per data row.
#
# This variant walks the DOM exposed by Invoke-WebRequest's ParsedHtml property. That
# property is built on the Internet Explorer engine, so it is only populated by Windows
# PowerShell (5.1 and earlier) when -UseBasicParsing is NOT specified.
$Uri = "http://midnightfreddie.com/reddit/simpletable.html"
$InfoPage = Invoke-WebRequest -Uri $Uri

# Fail with a clear message instead of a confusing "cannot call a method on a
# null-valued expression" error further down.
if ($null -eq $InfoPage.ParsedHtml) {
    throw "ParsedHtml is not available for '$Uri'. Run this in Windows PowerShell without -UseBasicParsing."
}

# Iterate over each <tbody>, which contains all the body rows for one table.
# (The browser DOM inserts a <tbody> even when the markup omits it.)
$InfoPage.ParsedHtml.getElementsByTagName("tbody") | ForEach-Object {
    # $null means "take the headers from the first row that has <td> cells".
    $Headers = $null
    # Might need to uncomment the following line depending on the table being parsed.
    # If there is more than one table, you need a way to pick the right headers for each one.
    #$Headers = @("IP Address", "Hostname", "HW Address", "Device Type")

    # Iterate over each <tr> in this table body.
    $_.getElementsByTagName("tr") | ForEach-Object {
        # Collect the text of every <td> in the row.
        #  - @(...) keeps a one-cell row as an array; otherwise PowerShell unrolls it to a
        #    plain string and $OutputRow[$i] would return single characters.
        #  - [string] turns the $null innerText of an empty cell into "" so later cells do
        #    not shift left into the wrong column.
        $OutputRow = @($_.getElementsByTagName("td") | ForEach-Object { [string]$_.innerText })

        # Rows without any <td> (for example a row made only of <th>) carry no data.
        # Inside ForEach-Object, 'return' moves on to the next pipeline item.
        if ($OutputRow.Count -eq 0) { return }

        if ($null -eq $Headers) {
            # No headers yet, so this must be the first row and it holds the column names.
            $Headers = $OutputRow
        } else {
            # Build an ordered hash (keeps column order) and turn it into an object.
            # These objects can be piped to Out-GridView, ConvertTo-Csv, Format-Table, etc.
            $OutputHash = [ordered]@{}
            for ($i = 0; $i -lt $OutputRow.Count; $i++) {
                # A row may have more cells than there are headers, or a header may be
                # blank; a $null/empty key is not usable, so fall back to a positional name.
                if ($i -lt $Headers.Count -and $Headers[$i]) {
                    $Name = $Headers[$i]
                } else {
                    $Name = "Column$($i + 1)"
                }
                $OutputHash[$Name] = $OutputRow[$i]
            }
            [pscustomobject]$OutputHash
        }
    }
}

# Example output for simpletable.html:
#
# IP Address  : 10.0.0.5
# Hostname    : pokey
# FQDN        : pokey.example.tld
# HW Address  : 012345679abcdef
# Device Type : Static-Your-Momma
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!doctype html>
<html lang="en">
  <head>
    <!--
      Fixture page for the table-parsing scripts. Keep this markup well-formed XML
      (every tag closed, no bare ampersands): one script loads it with an XML parser
      after dropping the line that holds the DOCTYPE declaration, so that declaration
      must stay alone on its own line. The first table row holds the column names.
    -->
    <title>Simple Table</title>
  </head>
  <body>
    <h1>Simple Table</h1>
    <p>To parse for data</p>
    <table>
      <tr>
        <td>IP Address</td>
        <td>Hostname</td>
        <td>FQDN</td>
        <td>HW Address</td>
        <td>Device Type</td>
      </tr>
      <tr>
        <td>10.0.0.5</td>
        <td>pokey</td>
        <td>pokey.example.tld</td>
        <td>012345679abcdef</td>
        <td>Static-Your-Momma</td>
      </tr>
    </table>
  </body>
</html>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrape every HTML table on a page into PowerShell objects, one object per data row.
#
# This variant does not need the Internet Explorer DOM: it loads the page as XML, so it
# only works when the HTML is well-formed XML (every tag closed, attributes quoted).
$Uri = "http://midnightfreddie.com/reddit/simpletable.html"
$InfoPage = Invoke-WebRequest -Uri $Uri

# Drop the line holding the <!doctype ...> declaration before handing the text to the
# XML parser, then join the remaining lines back into ONE string explicitly rather than
# relying on the implicit array-to-string conversion of the [xml] cast.
$Xml = [xml](($InfoPage.Content -split "`r?`n" | Where-Object { $_ -inotmatch '<!doctype' }) -join "`n")
# Same thing from a local copy of the page:
# $Xml = [xml](Get-Content .\simpletable.html | Where-Object { -not ($_ -imatch '<!doctype') } | Out-String)

# Iterate over each <table>.
$Xml.SelectNodes("//table") | ForEach-Object {
    # Use the <th> cells as headers when the table has them. When it has none this is an
    # empty array, and the first row of <td> cells is used as the header row instead.
    # Rows are matched both directly under <table> and inside <thead>/<tbody>/<tfoot>.
    # @(...) keeps a single header as a one-element array instead of a bare string.
    $Headers = @($_.SelectNodes("tr/th | thead/tr/th") | ForEach-Object { $_.InnerText })

    # Iterate over each <tr> of this table, in document order.
    $_.SelectNodes("tr | thead/tr | tbody/tr | tfoot/tr") | ForEach-Object {
        # Collect the text of every <td> in the row; @(...) keeps a one-cell row as an
        # array so $OutputRow[$i] yields the cell, not a single character of it.
        $OutputRow = @($_.SelectNodes("td") | ForEach-Object { $_.InnerText })

        # A row without <td> cells (typically the <th> header row) carries no data; skip
        # it instead of emitting an empty object. Inside ForEach-Object, 'return' moves
        # on to the next pipeline item.
        if ($OutputRow.Count -eq 0) { return }

        if ($Headers.Count -eq 0) {
            # No headers yet, so this must be the first row and it holds the column names.
            $Headers = $OutputRow
        } else {
            # Build an ordered hash (keeps column order) and turn it into an object.
            # These objects can be piped to Out-GridView, ConvertTo-Csv, Format-Table, etc.
            $OutputHash = [ordered]@{}
            for ($i = 0; $i -lt $OutputRow.Count; $i++) {
                # A row may have more cells than there are headers, or a header may be
                # blank; a $null/empty key is not usable, so fall back to a positional name.
                if ($i -lt $Headers.Count -and $Headers[$i]) {
                    $Name = $Headers[$i]
                } else {
                    $Name = "Column$($i + 1)"
                }
                $OutputHash[$Name] = $OutputRow[$i]
            }
            [pscustomobject]$OutputHash
        }
    }
}

# Example output for simpletable.html:
#
# IP Address  : 10.0.0.5
# Hostname    : pokey
# FQDN        : pokey.example.tld
# HW Address  : 012345679abcdef
# Device Type : Static-Your-Momma
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment