Last active
January 4, 2018 22:29
-
-
Save tonmcg/1d09b39d2c66dd6dfbe27ce0ff5401fd to your computer and use it in GitHub Desktop.
M Language Helper Functions for HTML Parsing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
let GetImages =
    // Html.GetImages
    // Downloads the page at `url` and returns a list with the absolute URL of
    // every <img> tag whose double-quoted src attribute is non-empty and does
    // not end in ".svg".
    //
    // Parameters:
    //   url - fully-qualified address of the page (text); it is passed to both
    //         Uri.Parts and Web.Contents.
    // Returns:
    //   A list of text values, in the order the tags appear in the markup.
    (url as text) =>
        let
            Parts = Uri.Parts(url),
            Protocol = Parts[Scheme],
            PathName = Parts[Path],
            Host = Parts[Host],
            // Scheme and host of the page, used to anchor relative image paths.
            Origin = Text.Combine({Protocol, "://", Host}),
            // Directory part of the page path (everything up to and including
            // the last "/"), used for sources such as "images/a.png".
            BaseDir = Text.BeforeDelimiter(PathName, "/", {0, RelativePosition.FromEnd}) & "/",
            Source = Text.FromBinary(Web.Contents(url)),
            // Splitting on "<img" leaves one fragment per image tag after the
            // leading chunk of markup that precedes the first tag.
            Fragments = List.Skip(Text.Split(Source, "<img"), 1),
            // Keep only the tag's own attributes (text before the closing ">")
            // and read the src value; yields "" when no src="..." is present.
            ExtractSrc = (fragment as text) as text =>
                Text.BetweenDelimiters(Text.BeforeDelimiter(fragment, ">"), "src=""", """"),
            Candidates = List.Transform(Fragments, ExtractSrc),
            // Filter rather than stop: a tag without src or an SVG image must
            // not hide the images that follow it in the document.
            Wanted = List.Select(Candidates, each _ <> "" and not Text.EndsWith(_, ".svg")),
            // Turn a src value into an absolute URL.
            Resolve = (src as text) as text =>
                if Text.StartsWith(src, "https://") or Text.StartsWith(src, "http://") then
                    // Already absolute.
                    src
                else if Text.StartsWith(src, "//") then
                    // Protocol-relative: inherit the scheme of the page.
                    Text.Combine({Protocol, ":", src})
                else if Text.StartsWith(src, "/") then
                    // Root-relative: anchor at the page's origin.
                    Text.Combine({Origin, src})
                else
                    // Document-relative: anchor at the page's directory.
                    Text.Combine({Origin, BaseDir, src}),
            Output = List.Transform(Wanted, Resolve)
        in
            Output,
    // Metadata shown by the Power Query function-invocation UI.
    DefineDocs = [
        Documentation.Name = " Html.GetImages",
        Documentation.Description = " Returns the URLs for all available images on a website",
        Documentation.LongDescription = " Returns a list of URLs for all non-SVG formatted images on a user-supplied URL. The supplied URL must be fully-qualified, i.e., contains ""http://"" or ""https://www"". #(lf) Excludes SVG file types.",
        Documentation.Category = " Html.Modification",
        Documentation.Source = " Inspired by solutions after Chris Webb & Imke Feldmann",
        Documentation.Author = " Tony McGovern: www.emdata.ai",
        Documentation.Examples = {
            [
                Description = " Return a list of all the current image URLs on Wikipedia's home page.",
                Code = " GetImages(""https://en.wikipedia.org/wiki/Main_Page"")",
                Result = " {""https://upload.wikimedia.org/wikipedia/commons/thumb/5/5d/Lawrence-Wetherby.jpg/100px-Lawrence-Wetherby.jpg"", ""https://upload.wikimedia.org/wikipedia/commons/thumb/f/f0/Guatemala_-_Chichi_Altar.jpg/150px-Guatemala_-_Chichi_Altar.jpg"", ... ""https://en.wikipedia.org/static/images/poweredby_mediawiki_88x31.png""}"
            ]
        }
    ]
in
    // Attach the documentation record to the function's type so it surfaces in the UI.
    Value.ReplaceType(
        GetImages,
        Value.ReplaceMetadata(
            Value.Type(GetImages),
            DefineDocs
        )
    )
- Replaced custom URL path functions with the native M Uri.Parts() function
- Deleted custom URL path functions
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
First commit