Skip to content

Instantly share code, notes, and snippets.

@owulveryck
Created October 7, 2021 12:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save owulveryck/5f9a07762ce40e6f6d9028e76bd798e2 to your computer and use it in GitHub Desktop.
Save owulveryck/5f9a07762ce40e6f6d9028e76bd798e2 to your computer and use it in GitHub Desktop.
Support file for the rePocketable article
package main
import (
"bytes"
"io"
"log"
"os"
"golang.org/x/net/html"
)
// sample is the HTML fixture fed to Example: a single <figure> that holds a
// low-resolution placeholder <img> (max/60), an <img> without any src
// attribute, and, right after it, a <noscript> element whose content is the
// full-resolution <img> (src plus srcSet).
var sample = `<figure class="ja jb jc jd je jf cw cx paragraph-image"> <div role="button" tabindex="0" class="jg jh ji jj aj jk"> <div class="cw cx iz"> <div class="jq s ji jr"> <div class="js jt s"> <div class="jl jm t u v jn aj at jo jp"> <img alt="" class="t u v jn aj ju jv jw" src="https://miro.medium.com/max/60/1*RSH2vh_xgQtjB68Zb7oBaA.jpeg?q=20" width="700" height="590" role="presentation" /> </div> <img alt="" class="jl jm t u v jn aj c" width="700" height="590" role="presentation" /><noscript><img alt="" class="t u v jn aj" src="https://miro.medium.com/max/1400/1*RSH2vh_xgQtjB68Zb7oBaA.jpeg" width="700" height="590" srcSet="https://miro.medium.com/max/552/1*RSH2vh_xgQtjB68Zb7oBaA.jpeg 276w, https://miro.medium.com/max/1104/1*RSH2vh_xgQtjB68Zb7oBaA.jpeg 552w, https://miro.medium.com/max/1280/1*RSH2vh_xgQtjB68Zb7oBaA.jpeg 640w, https://miro.medium.com/max/1400/1*RSH2vh_xgQtjB68Zb7oBaA.jpeg 700w" sizes="700px" role="presentation" /></noscript> </div> </div> </div> </div> </figure>`
// Example parses the sample fixture, runs preProcess on the resulting tree and
// renders the processed document to standard output. Any parsing, processing
// or rendering error aborts the program through log.Fatal.
func Example() {
	content := bytes.NewBufferString(sample)
	n, err := html.Parse(content)
	if err != nil {
		log.Fatal(err)
	}
	if err := preProcess(n); err != nil {
		log.Fatal(err)
	}
	// A failed write would otherwise go unnoticed and leave truncated output.
	if err := html.Render(os.Stdout, n); err != nil {
		log.Fatal(err)
	}
	// output:
	// <html><head></head><body><figure class="ja jb jc jd je jf cw cx paragraph-image"> <div role="button" tabindex="0" class="jg jh ji jj aj jk"> <div class="cw cx iz"> <div class="jq s ji jr"> <div class="js jt s"> <div class="jl jm t u v jn aj at jo jp"> </div> <img alt="" class="t u v jn aj" src="https://miro.medium.com/max/1400/1*RSH2vh_xgQtjB68Zb7oBaA.jpeg" width="700" height="590" srcset="https://miro.medium.com/max/552/1*RSH2vh_xgQtjB68Zb7oBaA.jpeg 276w, https://miro.medium.com/max/1104/1*RSH2vh_xgQtjB68Zb7oBaA.jpeg 552w, https://miro.medium.com/max/1280/1*RSH2vh_xgQtjB68Zb7oBaA.jpeg 640w, https://miro.medium.com/max/1400/1*RSH2vh_xgQtjB68Zb7oBaA.jpeg 700w" sizes="700px" role="presentation"/><noscript><img alt="" class="t u v jn aj" src="https://miro.medium.com/max/1400/1*RSH2vh_xgQtjB68Zb7oBaA.jpeg" width="700" height="590" srcSet="https://miro.medium.com/max/552/1*RSH2vh_xgQtjB68Zb7oBaA.jpeg 276w, https://miro.medium.com/max/1104/1*RSH2vh_xgQtjB68Zb7oBaA.jpeg 552w, https://miro.medium.com/max/1280/1*RSH2vh_xgQtjB68Zb7oBaA.jpeg 640w, https://miro.medium.com/max/1400/1*RSH2vh_xgQtjB68Zb7oBaA.jpeg 700w" sizes="700px" role="presentation" /></noscript> </div> </div> </div> </div> </figure></body></html>
}
// preProcess walks the tree rooted at n. For every <figure> element it lets
// processFigure collect the <img> nodes of that figure and pick the valid one,
// then detaches every other collected image from the tree.
//
// It returns the first genuine error reported while inspecting a figure or
// while processing a descendant, and nil otherwise. A nil n is a no-op.
func preProcess(n *html.Node) error {
	if n == nil {
		return nil
	}
	if n.Type == html.ElementNode && n.Data == "figure" {
		f := &figure{}
		// processFigure reports a completed traversal with io.EOF; any other
		// non-nil value is a real failure and must not be dropped.
		if err := f.processFigure(n); err != nil && err != io.EOF {
			return err
		}
		// Clear all other images (medium, towarddatascience, ...), but only
		// when a valid image was identified: without that guard a figure with
		// several plain images would lose every one of them.
		if f.validImage != nil && len(f.images) > 1 {
			for _, img := range f.images {
				if img != f.validImage && img.Parent != nil {
					img.Parent.RemoveChild(img)
				}
			}
		}
	}
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		if err := preProcess(c); err != nil {
			return err
		}
	}
	return nil
}
// figure accumulates what processFigure finds while walking a subtree.
type figure struct {
// images lists every <img> element node met by processFigure.
images []*html.Node
// validImage is the <img> node directly preceding a <noscript> node, as
// recorded by processFigure; it stays nil when no such pair was found.
validImage *html.Node
}
// processFigure walks the subtree rooted at n and records every <img> element
// in f.images. When it meets a <noscript> element whose previous sibling is an
// <img> element, it parses the text content of the noscript as HTML, copies
// the attributes of the first <img> found there onto the preceding image, and
// records that preceding image as f.validImage.
//
// Return value: io.EOF means the subtree was traversed completely (this is the
// success signal of this method, not a failure); any other value is the error
// returned by html.Parse for a noscript content. A nil n yields io.EOF.
func (f *figure) processFigure(n *html.Node) error {
	if n == nil {
		return io.EOF
	}
	if n.Type == html.ElementNode && n.Data == "img" {
		f.images = append(f.images, n)
	}
	// Only a real <noscript> element qualifies: a text node whose text happens
	// to be "noscript" must not be mistaken for one.
	if n.Type == html.ElementNode && n.Data == "noscript" {
		originalImg, content := n.PrevSibling, n.FirstChild
		// A noscript that is a first child has no previous sibling, and an
		// empty noscript has no content: both cases are skipped rather than
		// dereferencing a nil node.
		if originalImg != nil && content != nil &&
			originalImg.Type == html.ElementNode && originalImg.Data == "img" {
			// the img data is encoded as a string in the content.Data field
			// Let's parse it as a node:
			doc, err := html.Parse(bytes.NewBufferString(content.Data))
			if err != nil {
				return err
			}
			if img := getImgNode(doc); img != nil {
				originalImg.Attr = img.Attr
			}
			f.validImage = originalImg
		}
	}
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		// io.EOF from a child only means "that subtree is done": keep going.
		if err := f.processFigure(c); err != nil && err != io.EOF {
			return err
		}
	}
	return io.EOF
}
// getImgNode searches node and its descendants in document order and returns
// the first <img> element it meets, or nil when the subtree contains none.
func getImgNode(node *html.Node) *html.Node {
	isImg := func(candidate *html.Node) bool {
		return candidate.Type == html.ElementNode && candidate.Data == "img"
	}
	if isImg(node) {
		return node
	}

	// Depth-first walk driven by an explicit stack. The siblings of the root
	// are never pushed, so the search stays inside the subtree of node.
	var pending []*html.Node
	if node.FirstChild != nil {
		pending = append(pending, node.FirstChild)
	}
	for len(pending) > 0 {
		last := len(pending) - 1
		current := pending[last]
		pending = pending[:last]
		if isImg(current) {
			return current
		}
		// Push the sibling before the child so the child's subtree is
		// exhausted first, which keeps the document (pre-order) ordering.
		if current.NextSibling != nil {
			pending = append(pending, current.NextSibling)
		}
		if current.FirstChild != nil {
			pending = append(pending, current.FirstChild)
		}
	}
	return nil
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment