Skip to content

Instantly share code, notes, and snippets.

@lshifr
Last active March 9, 2016 21:24
Show Gist options
  • Save lshifr/f619437cbeebc368367e to your computer and use it in GitHub Desktop.
Save lshifr/f619437cbeebc368367e to your computer and use it in GitHub Desktop.
A very simplistic breadth - first HTML parser in Wolfram Mathematica
BeginPackage["HTMLParser`"]
ParseHTML::usage = "ParseHTML[s] parses a string of HTML code";
HTMLContainer::usage = "HTMLContainer[tag] is an inert container for the contents of html tag";
Begin["`Private`"]
listSplit[x_List,lengthlist_List,headlist_List]:=
MapThread[
#1@@Take[x,#2]&,
{
headlist,
Transpose[({Most[#1]+1,Rest[#1]}&)[FoldList[Plus,0,lengthlist]]]
}
];
reconstructIntervals[listlen_Integer,ints_List]:=
Module[{missed,startint,lastint},
startint=If[ints[[1,1]]==1,{},{1,ints[[1,1]]-1}];
lastint=If[ints[[-1,-1]]==listlen,{},{ints[[-1,-1]]+1,listlen}];
missed=
(If[#1[[2,1]]-#1[[1,2]]>1,{#1[[1,2]]+1,#1[[2,1]]-1},{}]&)/@
Partition[ints,2,1];
missed=Join[missed,{lastint}];
Prepend[Flatten[Transpose[{ints,missed}],1],startint]
];
groupElements[lst_List,poslist_List,headlist_List] /; OrderedQ[
Flatten[Sort[poslist]]
]&&Length[headlist]==Length[poslist]:=
Module[{totalheadlist,allints,llist},
totalheadlist=
Append[
Flatten[
Transpose[{Array[Sequence&,{Length[headlist]}],headlist}],1
],
Sequence
];
allints=reconstructIntervals[Length[lst],poslist];
llist=(If[#1==={},0,1-Subtract@@#1]&)/@allints;
listSplit[lst,llist,totalheadlist]
];
groupElements[h_[x__],poslist_List,headlist_List]:=
h[Sequence@@groupElements[{x},poslist,headlist]];
groupElements[expr_,poslist_List,head_]:=
groupElements[expr,poslist,Table[head,{Length[poslist]}]];
groupPositions[plist_List]:=
Reap[(Sow[Last[#1],{Most[#1]}]&)/@plist,_,List][[2]];
processPosList::unmatchedMessageName="Unmatched lists `1` enountered!";
processPosList[{openlist_List,closelist_List}]:=
Module[{opengroup,closegroup,poslist},
{opengroup,closegroup}=groupPositions/@{openlist,closelist};
poslist=Transpose[(Transpose[Sort[#1]]&)/@{opengroup,closegroup}];
If[
UnsameQ@@poslist[[1]]
,
Return[
Message[
processPosList::unmatchedMessageName,{openlist,closelist}
];
{}
]
,
(*else*)
poslist=Transpose[{poslist[[1,1]],Transpose/@Transpose[poslist[[2]]]}]
]
];
groupElementsNested[nested_,{openposlist_List,closeposlist_List},head_] /; Head[head]=!=List:=
Fold[
Function[{x,y},MapAt[groupElements[#1,y[[2]],head]&,x,{y[[1]]}]],
nested,
Sort[
processPosList[{openposlist,closeposlist}],
Length[#2[[1]]]<Length[#1[[1]]]&
]
];
getAllUsedTags[text_String]:=
Module[{htmlTagsposlist,result,chars=Characters[text],x},
htmlTagsposlist=
StringPosition[
text,ShortestMatch["<"~~x__~~Whitespace|">"],Overlaps->True
];
result=
Union[
ToLowerCase/@
Apply[
StringJoin,
(Take[chars,{#1[[1]],#1[[2]]-1}]&)/@htmlTagsposlist,
{1}
] /. {"<br/" :> "<br"}
]
];
refineTags[tags_List]:=
Module[{alphabet=Characters["abcdefghijklmnopqrstuvwxyz/"]},
DeleteCases[tags,x_/;!MemberQ[alphabet,StringTake[x,{2,2}]]]
];
getTagTitle[tag_String]:=
If[StringTake[tag,{2,2}]==="/",StringDrop[tag,{2}],tag];
getPairedTags[tags_List]:=
Reverse/@
Select[Reap[(Sow[#1,getTagTitle[#1]]&)/@tags,_,#2&][[2]],Length[#1]==2&];
makeTagReplaceRules[pairedtags_List,unpairedtags_List]:=
Sort[
Join[
Apply[Rule,({#1,{StringDrop[#1,1],"Unpaired","Open"}}&)/@unpairedtags,{1}],
Apply[
Rule,
Flatten[
(Transpose[{#1,{{StringDrop[#1[[1]],1],"Open"},{StringDrop[#1[[2]],2],"Close"}}}]&)/@
pairedtags,
1
],
{1}
],
{">" -> {">","UnpairedClose"}}
],
StringLength[#1[[2,1]]] > StringLength[#2[[2,1]]]&
];
getTagNames[pairedtags_List,unpairedtags_List]:=
(StringDrop[#1,1]&)/@Join[Transpose[pairedtags][[1]],unpairedtags];
makeTagHashRules[tagnames_List]:=
Dispatch[MapThread[Rule,({#1,Range[Length[#1]]}&)[tagnames]]];
tagSplit[text_String,{tagrules__Rule}]:=
DeleteCases[
StringSplit[text,{tagrules}],x_/;StringMatchQ[x,Whitespace|""~~">"~~Whitespace|""]
];
splitText[text_String,pairedtags_List,unpairedtags_List]:=
tagSplit[text,makeTagReplaceRules[pairedtags,unpairedtags]];
preparse[text_]:=
Module[{step1},
With[{pos=Position[text,{_,"Open"}|{_,_,"Open"},\[Infinity]]},
step1=
ReplacePart[
text,
HTMLContainer["attr"]/@Extract[text,pos+1],
pos+1,
List/@Range[Length[pos]]
]
]
];
openCloseEnumerate[splittext_List,pairedtags_List,unpairedtags_List]:=
Module[{tagnames,taghashrules,tagtitlecounters,unpairedstack={},temptag},
tagnames=getTagNames[pairedtags,unpairedtags];
taghashrules=makeTagHashRules[tagnames];
tagtitlecounters=Table[0,{Length[tagnames]}];
(
Switch[
#1,
{x_,"Open"},
{#1[[1]],"Open",++tagtitlecounters[[#1[[1]]/.taghashrules]]},
{x_,"Close"},
{#1[[1]],"Close",tagtitlecounters[[#1[[1]]/.taghashrules]]--},
{x_,"Unpaired","Open"},
AppendTo[unpairedstack,#1];
{#1[[1]],"Open",++tagtitlecounters[[#1[[1]]/.taghashrules]]},
{"/>"|">","UnpairedClose"},
If[
Length[unpairedstack]>0
,
temptag=unpairedstack[[-1]];
unpairedstack=Most[unpairedstack];
{temptag[[1]],"Close",tagtitlecounters[[temptag[[1]]/.taghashrules]]--}
,
(*else*)
#1
],
_,
#1
]&
)/@
splittext
];
getOpenCloseForm[text_String,pairedtags_List,unpairedtags_List]:=
openCloseEnumerate[
preparse[splitText[text,pairedtags,unpairedtags]],pairedtags,unpairedtags
];
makeTagDepthList[opencloseform_List,pairedtags_List,unpairedtags_List]:=
DeleteCases[
({#1,Max[Cases[opencloseform,{#1,"Open"|"Close",x_Integer}:>x]]}&)/@
getTagNames[pairedtags,unpairedtags],
{x_,-\[Infinity]}
];
oneStepParse[parsed_,depth_Integer,tag_String,head_]:=
Module[{plist=(Position[parsed,{tag,#1,depth},\[Infinity]]&)/@{"Open","Close"}},
groupElementsNested[parsed,plist,head]
];
tagProcess[parseme_,{tag_String,maxdepth_Integer}]:=
Module[{hd=HTMLContainer[tag],result},
With[{hd1=hd,ourtag=tag},
hd1[{ourtag,"Open",n_},x__,{ourtag,"Close",n_}]:=
hd1[x];
result=Fold[oneStepParse[#1,#2,ourtag,hd1]&,parseme,Range[maxdepth,1,-1]]
];
Clear[hd];
result
];
openCloseProcess[opencloseform_List,pairedtags_List,unpairedtags_List]:=
Fold[tagProcess,opencloseform,makeTagDepthList[opencloseform,pairedtags,unpairedtags]];
documentParse[text_String,pairedtags_List,unpairedtags_List]:=
openCloseProcess[
getOpenCloseForm[text,pairedtags,unpairedtags],pairedtags,unpairedtags
];
refineParsed[parsed_]:=
(If[#1==={},#1,First[#1]]&)[Cases[parsed,HTMLContainer["html"][___]]];
parseText[text_String]:=
Module[{tags,paired,unpaired,parsed},
tags=refineTags[getAllUsedTags[text]];
paired=getPairedTags[tags];
unpaired=Complement[tags,Flatten[paired]];
parsed=
refineParsed[documentParse[text,paired,unpaired]]/.{">","UnpairedClose"}:>">";
{parsed,paired,unpaired}
];
removeLeaves[parsed_]:=
DeleteCases[parsed,_,{-1}];
postProcess[parsed_]:=
DeleteCases[parsed,">"|"",\[Infinity]];
Options[ParseHTML] = {
"PostProcess" -> False,
"RemoveLeaves" -> False
};
ParseHTML[html_String, opts:OptionsPattern[]]:=
With[{
pp = If[TrueQ["PostProcess"], postProcess, Identity],
rl = If[TrueQ["RemoveLeaves"], removeLeaves, Identity]
},
pp @ rl @ parseText @ html
];
End[]
EndPackage[]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment