Last active
March 9, 2016 21:24
-
-
Save lshifr/f619437cbeebc368367e to your computer and use it in GitHub Desktop.
A very simplistic breadth - first HTML parser in Wolfram Mathematica
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
BeginPackage["HTMLParser`"] | |
ParseHTML::usage = "ParseHTML[s] parses a string of HTML code"; | |
HTMLContainer::usage = "HTMLContainer[tag] is an inert container for the contents of html tag"; | |
Begin["`Private`"] | |
listSplit[x_List,lengthlist_List,headlist_List]:= | |
MapThread[ | |
#1@@Take[x,#2]&, | |
{ | |
headlist, | |
Transpose[({Most[#1]+1,Rest[#1]}&)[FoldList[Plus,0,lengthlist]]] | |
} | |
]; | |
reconstructIntervals[listlen_Integer,ints_List]:= | |
Module[{missed,startint,lastint}, | |
startint=If[ints[[1,1]]==1,{},{1,ints[[1,1]]-1}]; | |
lastint=If[ints[[-1,-1]]==listlen,{},{ints[[-1,-1]]+1,listlen}]; | |
missed= | |
(If[#1[[2,1]]-#1[[1,2]]>1,{#1[[1,2]]+1,#1[[2,1]]-1},{}]&)/@ | |
Partition[ints,2,1]; | |
missed=Join[missed,{lastint}]; | |
Prepend[Flatten[Transpose[{ints,missed}],1],startint] | |
]; | |
groupElements[lst_List,poslist_List,headlist_List] /; OrderedQ[ | |
Flatten[Sort[poslist]] | |
]&&Length[headlist]==Length[poslist]:= | |
Module[{totalheadlist,allints,llist}, | |
totalheadlist= | |
Append[ | |
Flatten[ | |
Transpose[{Array[Sequence&,{Length[headlist]}],headlist}],1 | |
], | |
Sequence | |
]; | |
allints=reconstructIntervals[Length[lst],poslist]; | |
llist=(If[#1==={},0,1-Subtract@@#1]&)/@allints; | |
listSplit[lst,llist,totalheadlist] | |
]; | |
groupElements[h_[x__],poslist_List,headlist_List]:= | |
h[Sequence@@groupElements[{x},poslist,headlist]]; | |
groupElements[expr_,poslist_List,head_]:= | |
groupElements[expr,poslist,Table[head,{Length[poslist]}]]; | |
groupPositions[plist_List]:= | |
Reap[(Sow[Last[#1],{Most[#1]}]&)/@plist,_,List][[2]]; | |
processPosList::unmatchedMessageName="Unmatched lists `1` enountered!"; | |
processPosList[{openlist_List,closelist_List}]:= | |
Module[{opengroup,closegroup,poslist}, | |
{opengroup,closegroup}=groupPositions/@{openlist,closelist}; | |
poslist=Transpose[(Transpose[Sort[#1]]&)/@{opengroup,closegroup}]; | |
If[ | |
UnsameQ@@poslist[[1]] | |
, | |
Return[ | |
Message[ | |
processPosList::unmatchedMessageName,{openlist,closelist} | |
]; | |
{} | |
] | |
, | |
(*else*) | |
poslist=Transpose[{poslist[[1,1]],Transpose/@Transpose[poslist[[2]]]}] | |
] | |
]; | |
groupElementsNested[nested_,{openposlist_List,closeposlist_List},head_] /; Head[head]=!=List:= | |
Fold[ | |
Function[{x,y},MapAt[groupElements[#1,y[[2]],head]&,x,{y[[1]]}]], | |
nested, | |
Sort[ | |
processPosList[{openposlist,closeposlist}], | |
Length[#2[[1]]]<Length[#1[[1]]]& | |
] | |
]; | |
getAllUsedTags[text_String]:= | |
Module[{htmlTagsposlist,result,chars=Characters[text],x}, | |
htmlTagsposlist= | |
StringPosition[ | |
text,ShortestMatch["<"~~x__~~Whitespace|">"],Overlaps->True | |
]; | |
result= | |
Union[ | |
ToLowerCase/@ | |
Apply[ | |
StringJoin, | |
(Take[chars,{#1[[1]],#1[[2]]-1}]&)/@htmlTagsposlist, | |
{1} | |
] /. {"<br/" :> "<br"} | |
] | |
]; | |
refineTags[tags_List]:= | |
Module[{alphabet=Characters["abcdefghijklmnopqrstuvwxyz/"]}, | |
DeleteCases[tags,x_/;!MemberQ[alphabet,StringTake[x,{2,2}]]] | |
]; | |
getTagTitle[tag_String]:= | |
If[StringTake[tag,{2,2}]==="/",StringDrop[tag,{2}],tag]; | |
getPairedTags[tags_List]:= | |
Reverse/@ | |
Select[Reap[(Sow[#1,getTagTitle[#1]]&)/@tags,_,#2&][[2]],Length[#1]==2&]; | |
makeTagReplaceRules[pairedtags_List,unpairedtags_List]:= | |
Sort[ | |
Join[ | |
Apply[Rule,({#1,{StringDrop[#1,1],"Unpaired","Open"}}&)/@unpairedtags,{1}], | |
Apply[ | |
Rule, | |
Flatten[ | |
(Transpose[{#1,{{StringDrop[#1[[1]],1],"Open"},{StringDrop[#1[[2]],2],"Close"}}}]&)/@ | |
pairedtags, | |
1 | |
], | |
{1} | |
], | |
{">" -> {">","UnpairedClose"}} | |
], | |
StringLength[#1[[2,1]]] > StringLength[#2[[2,1]]]& | |
]; | |
getTagNames[pairedtags_List,unpairedtags_List]:= | |
(StringDrop[#1,1]&)/@Join[Transpose[pairedtags][[1]],unpairedtags]; | |
makeTagHashRules[tagnames_List]:= | |
Dispatch[MapThread[Rule,({#1,Range[Length[#1]]}&)[tagnames]]]; | |
tagSplit[text_String,{tagrules__Rule}]:= | |
DeleteCases[ | |
StringSplit[text,{tagrules}],x_/;StringMatchQ[x,Whitespace|""~~">"~~Whitespace|""] | |
]; | |
splitText[text_String,pairedtags_List,unpairedtags_List]:= | |
tagSplit[text,makeTagReplaceRules[pairedtags,unpairedtags]]; | |
preparse[text_]:= | |
Module[{step1}, | |
With[{pos=Position[text,{_,"Open"}|{_,_,"Open"},\[Infinity]]}, | |
step1= | |
ReplacePart[ | |
text, | |
HTMLContainer["attr"]/@Extract[text,pos+1], | |
pos+1, | |
List/@Range[Length[pos]] | |
] | |
] | |
]; | |
openCloseEnumerate[splittext_List,pairedtags_List,unpairedtags_List]:= | |
Module[{tagnames,taghashrules,tagtitlecounters,unpairedstack={},temptag}, | |
tagnames=getTagNames[pairedtags,unpairedtags]; | |
taghashrules=makeTagHashRules[tagnames]; | |
tagtitlecounters=Table[0,{Length[tagnames]}]; | |
( | |
Switch[ | |
#1, | |
{x_,"Open"}, | |
{#1[[1]],"Open",++tagtitlecounters[[#1[[1]]/.taghashrules]]}, | |
{x_,"Close"}, | |
{#1[[1]],"Close",tagtitlecounters[[#1[[1]]/.taghashrules]]--}, | |
{x_,"Unpaired","Open"}, | |
AppendTo[unpairedstack,#1]; | |
{#1[[1]],"Open",++tagtitlecounters[[#1[[1]]/.taghashrules]]}, | |
{"/>"|">","UnpairedClose"}, | |
If[ | |
Length[unpairedstack]>0 | |
, | |
temptag=unpairedstack[[-1]]; | |
unpairedstack=Most[unpairedstack]; | |
{temptag[[1]],"Close",tagtitlecounters[[temptag[[1]]/.taghashrules]]--} | |
, | |
(*else*) | |
#1 | |
], | |
_, | |
#1 | |
]& | |
)/@ | |
splittext | |
]; | |
getOpenCloseForm[text_String,pairedtags_List,unpairedtags_List]:= | |
openCloseEnumerate[ | |
preparse[splitText[text,pairedtags,unpairedtags]],pairedtags,unpairedtags | |
]; | |
makeTagDepthList[opencloseform_List,pairedtags_List,unpairedtags_List]:= | |
DeleteCases[ | |
({#1,Max[Cases[opencloseform,{#1,"Open"|"Close",x_Integer}:>x]]}&)/@ | |
getTagNames[pairedtags,unpairedtags], | |
{x_,-\[Infinity]} | |
]; | |
oneStepParse[parsed_,depth_Integer,tag_String,head_]:= | |
Module[{plist=(Position[parsed,{tag,#1,depth},\[Infinity]]&)/@{"Open","Close"}}, | |
groupElementsNested[parsed,plist,head] | |
]; | |
tagProcess[parseme_,{tag_String,maxdepth_Integer}]:= | |
Module[{hd=HTMLContainer[tag],result}, | |
With[{hd1=hd,ourtag=tag}, | |
hd1[{ourtag,"Open",n_},x__,{ourtag,"Close",n_}]:= | |
hd1[x]; | |
result=Fold[oneStepParse[#1,#2,ourtag,hd1]&,parseme,Range[maxdepth,1,-1]] | |
]; | |
Clear[hd]; | |
result | |
]; | |
openCloseProcess[opencloseform_List,pairedtags_List,unpairedtags_List]:= | |
Fold[tagProcess,opencloseform,makeTagDepthList[opencloseform,pairedtags,unpairedtags]]; | |
documentParse[text_String,pairedtags_List,unpairedtags_List]:= | |
openCloseProcess[ | |
getOpenCloseForm[text,pairedtags,unpairedtags],pairedtags,unpairedtags | |
]; | |
refineParsed[parsed_]:= | |
(If[#1==={},#1,First[#1]]&)[Cases[parsed,HTMLContainer["html"][___]]]; | |
parseText[text_String]:= | |
Module[{tags,paired,unpaired,parsed}, | |
tags=refineTags[getAllUsedTags[text]]; | |
paired=getPairedTags[tags]; | |
unpaired=Complement[tags,Flatten[paired]]; | |
parsed= | |
refineParsed[documentParse[text,paired,unpaired]]/.{">","UnpairedClose"}:>">"; | |
{parsed,paired,unpaired} | |
]; | |
removeLeaves[parsed_]:= | |
DeleteCases[parsed,_,{-1}]; | |
postProcess[parsed_]:= | |
DeleteCases[parsed,">"|"",\[Infinity]]; | |
Options[ParseHTML] = { | |
"PostProcess" -> False, | |
"RemoveLeaves" -> False | |
}; | |
ParseHTML[html_String, opts:OptionsPattern[]]:= | |
With[{ | |
pp = If[TrueQ["PostProcess"], postProcess, Identity], | |
rl = If[TrueQ["RemoveLeaves"], removeLeaves, Identity] | |
}, | |
pp @ rl @ parseText @ html | |
]; | |
End[] | |
EndPackage[] | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment