Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@tonmcg
Last active May 14, 2018 19:13
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save tonmcg/1173759b95943b2b9ed290b9edbe74d3 to your computer and use it in GitHub Desktop.
Save tonmcg/1173759b95943b2b9ed290b9edbe74d3 to your computer and use it in GitHub Desktop.
M Language Helper Functions for HTML Parsing
let GetTables =
    /*
        HTML.GetTables

        Downloads the document at `url`, rebuilds a reduced HTML page that keeps
        only the original doctype, <html> opening tag, <head> section and every
        <table>…</table> fragment, hands that page to Web.Page, and returns the
        rows of the result whose [Source] column equals "Table".

        Parameter:
            url (text) – address passed to Web.Contents.

        Returns:
            The Web.Page result filtered to rows where [Source] = "Table".

        Notes:
            * Delimiter matching is plain, case-sensitive text search, so
              upper-case tags such as <TABLE> are not picked up.
            * Each fragment ends at the first "</table>" after its "<table",
              so nested tables are not reassembled faithfully.
    */
    (url as text) =>
        let
            // Raw markup of the page as text.
            DOM = Text.FromBinary(Web.Contents(url)),

            // Leading "<!…>" declaration. Emitted only when the page has one,
            // so that a bogus "<!>" is not fabricated for pages without it.
            DOCTYPE =
                if Text.Contains(DOM, "<!") then
                    "<!" & Text.BetweenDelimiters(DOM, "<!", ">") & ">"
                else
                    "",

            // Opening <html …> tag with its attributes. When the page has no
            // "<html", this relies on Text.BetweenDelimiters yielding "" (the
            // same assumption the table loop below uses to terminate), which
            // produces a plain "<html>".
            HTMLOpeningTag = "<html" & Text.BetweenDelimiters(DOM, "<html", ">") & ">",

            // Full <head>…</head> section. It is copied only when both the
            // opening and the closing delimiter are present; otherwise an empty
            // head is used instead of the malformed "<head</head>".
            HEAD =
                if Text.Contains(DOM, "<head") and Text.Contains(DOM, "</head>") then
                    "<head" & Text.BetweenDelimiters(DOM, "<head", "</head>") & "</head>"
                else
                    "<head></head>",

            // The original <body …> attributes are intentionally not carried over.
            BODYOpeningTag = "<body>",

            // Recursively concatenates the n-th, (n+1)-th, … table fragments.
            // Recursion stops at the first index for which no text is found
            // between "<table" and "</table>".
            CollectTables = (n as number) =>
                let
                    Fragment = Text.BetweenDelimiters(DOM, "<table", "</table>", n)
                in
                    if Fragment = "" then
                        ""
                    else
                        Text.Combine({ "<table", Fragment, "</table>", @CollectTables(n + 1) }),

            TABLES = CollectTables(0),

            // Reduced document: original prologue and head, tables only in the body.
            HTML = Text.Combine({DOCTYPE, HTMLOpeningTag, HEAD, BODYOpeningTag, TABLES, "</body></html>"}),
            Page = Web.Page(HTML),
            Tables = Table.SelectRows(Page, each ([Source] = "Table"))
        in
            Tables,

    // Metadata attached to the function type so that the documentation is
    // available alongside the function.
    DefineDocs = [
        Documentation.Name = " HTML.GetTables",
        Documentation.Description = " Returns the contents of all table nodes within the HTML document broken into its constituent structures",
        Documentation.LongDescription = " Returns the contents of all table nodes within the HTML document broken into its constituent structures of a user-supplied URL.",
        Documentation.Category = " Html.Modification",
        Documentation.Source = " Inspired by solutions after Imke Feldmann",
        Documentation.Author = " Tony McGovern: www.emdata.ai",
        Documentation.Examples = {
            [
                Description = "",
                Code = " GetTables(""https://www.census.gov/geo/reference/ansi_statetables.html"")",
                Result = ""
            ]
        }
    ]
in
    // Re-type the function with its own type plus the documentation metadata.
    Value.ReplaceType(
        GetTables,
        Value.ReplaceMetadata(
            Value.Type(GetTables),
            DefineDocs
        )
    )
@tonmcg
Copy link
Author

tonmcg commented Feb 11, 2018

First commit

@tonmcg
Copy link
Author

tonmcg commented Feb 11, 2018

Added function to retrieve all table node content for each <table> tag

@tonmcg
Copy link
Author

tonmcg commented Feb 11, 2018

Fixed hard-coded reference to URL

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment