Skip to content

Instantly share code, notes, and snippets.

@rgchris
Last active December 7, 2022 17:51
Show Gist options
  • Save rgchris/8172326 to your computer and use it in GitHub Desktop.
Save rgchris/8172326 to your computer and use it in GitHub Desktop.
An exercise in scraping Blogger HTML content with Rebol.
REBOL [
Title: "Scraper"
Date: 1-Dec-2013
]
textize: func [
    "Convert an HTML string to plain-text markup by walking its parsed node tree."
    html [string!] "HTML source; parsed with load-html/dom"
    /local rule last-list value new-table in-pre
][
    ; Returns the converted text with leading and trailing whitespace trimmed.
    ;
    ; load-html (and AS, used for <img>) are defined outside this function.
    ; The walk relies on nodes exposing /name, /value, /children, /get and /clone.
    html: load-html/dom html

    ; Stack of list markers: one "*" per open <ul>, one "#" per open <ol>.
    ; COPY is essential here: the string is modified in place below, and
    ; without a fresh copy per call any markers left behind by a call that
    ; aborted mid-walk would still be present on the next call.
    last-list: copy ""
    in-pre: false

    html: join "" collect [
        ; RULE is evaluated once per node; the word NODE is supplied by the
        ; FOREACH loops that run this block (it is deliberately not a local).
        ; NOTE(review): nested traversal re-enters this same block through
        ; FOREACH -- confirm that the interpreter in use keeps the outer NODE
        ; intact after an inner loop returns.
        rule: [
            ; ---- Phase 1: text emitted when a node is entered ----
            switch node/name [
                <li> [keep rejoin ["^/" last-list " "]]
                <ul> [append last-list "*"]
                <ol> [append last-list "#"]
                <h1> [keep "^/=== "]
                <h2> [keep "^/=== "]
                <h3> [keep "^/--- "]
                <h4> [keep "^/... "]
                <pre> <code> [keep "^/^/^-" in-pre: true]
                ; <div> <span>
                <b> <strong> [keep "=b "]
                <i> <em> [keep "=i "]
                <u> [keep "=u "]
                <blockquote> [] ; [keep "^/\blockquote^/^/"]
                <table> [keep "^/\table^/^/" new-table: true]
                ; The first row of a table needs no "=row" separator.
                <tr> [either new-table [new-table: false][keep "^/^/=row^/^/"]]
            ]

            ; ---- Phase 2: the node's content ----
            case [
                ; Text node: indent continuation lines inside preformatted
                ; text, otherwise escape "=" because it introduces markup.
                ; Note that REPLACE/ALL modifies node/value in place.
                any [
                    node/name = %.txt
                    string? node/value
                ][
                    keep either in-pre [
                        replace/all node/value "^/" "^/^-"
                    ][
                        replace/all node/value "=" "\="
                    ]
                ]

                ; Nothing under <head> is emitted.
                node/name = <head> []

                node/name = <br> [keep either in-pre ["^/^-"]["^/"]]

                ; Links become "[text](href)".
                node/name = <a> [
                    keep "["
                    case [
                        block? node/value [foreach node node/children :rule]
                        string? node/value [keep node/value]
                    ]
                    keep rejoin ["](" node/get #href ")"]
                ]

                ; Images become a molded block: =[image <src> <alt-or-"Image">]
                node/name = <img> [
                    keep join "=" remold new-line/all [
                        'image
                        any [as url! node/get #src as file! node/get #src]
                        any [node/get #alt "Image"]
                    ] false
                ]

                node/name = <form> [keep "^/^/=donate^/^/"]

                ; Iframes: the src, followed by " WxH", " W" or " -1xH"
                ; depending on which of width/height are present.
                node/name = <iframe> [
                    keep join "^/^/" node/get #src
                    either node/get #width [
                        keep join " " node/get #width
                        if node/get #height [
                            keep join "x" node/get #height
                        ]
                    ][
                        if node/get #height [
                            keep join " -1x" node/get #height
                        ]
                    ]
                ]

                ; Progress bar: a <div class="pbar-o"> whose inner <div>
                ; carries a "width: NN%" style; emits =[progress NN].
                ; NOTE(review): assumes the inner <div> and its width style
                ; are always present -- VALUE would be none otherwise.
                all [
                    node/name = <div>
                    find ["pbar-o"] node/get #class
                ][
                    value: node/get <div>
                    value: value/get #style
                    parse value [thru "width:" copy value to "%" to end]
                    keep rejoin ["=[progress " trim value "]"]
                ]

                ; Any other element: optionally wrap its children in a
                ; pseudo-tag selected from the exact text of its style attribute.
                true [
                    ; don't need the /clone function, so reuse to temporarily store style
                    if node/clone: switch node/get #style [
                        "color: red;"
                        "color: rgb(204, 0, 0); "
                        "color: #990000;"
                        "color: #cc0000;"
                        {color: #cc0000; font-family: 'Trebuchet MS', Trebuchet, sans-serif; line-height: 18px;} [<red>]
                        ; "color: #a32f2f;"
                        ; "color: #a32f2f; text-decoration: none;"
                        ; "color: #a32f2f;" [<darkred>]

                        ; Large fonts are treated as headings; these cases
                        ; yield none, so no wrapping tag is emitted.
                        "font-family: inherit; font-size: large;"
                        "font-size: large; line-height: 18px;"
                        "font-size: large; "
                        "font-size: large;"
                        {font-family: 'Trebuchet MS', Trebuchet, sans-serif; font-size: large;}
                        {font-family: 'Trebuchet MS', sans-serif; font-size: large;} [keep "^/--- " none]
                        "font-size: x-large;" [keep "^/=== " none]

                        ; The result blocks for the green groups are commented
                        ; out, so every style from here down falls through to
                        ; the [<code>] block.
                        "color: #6aa84f;"
                        {color: #38761d; font-family: Trebuchet MS, sans-serif;}
                        "color: #38761d;" ; [<darkgreen>]
                        {color: #6aa84f; font-family: Trebuchet MS, sans-serif; font-weight: normal;}
                        {color: #6aa84f; font-family: 'Trebuchet MS', sans-serif;}
                        {color: #6aa84f; font-family: Trebuchet MS, sans-serif;} ; [<green>]
                        {color: #6aa84f; font-family: Courier New, Courier, monospace;}
                        {color: #38761d; font-family: 'Courier New', Courier, monospace;}
                        {color: #38761d; font-family: Courier New, Courier, monospace;} ; [<monogreen>]
                        {font-family: 'Courier New', Courier, monospace; white-space: pre;}
                        "font-family: 'Courier New', Courier, monospace;"
                        "font-family: monospace;"
                        "font-family: Courier New, Courier, monospace;" [<code>]

                        ; Styles seen in the input but intentionally ignored:
                        ; "width: 145px;" [<145px>]
                        ; "font-family: inherit;" [<inherit>]
                        ; "text-align: center;"
                        ; "margin-left: auto; margin-right: auto;"
                        ; {margin-left: auto; margin-right: auto; text-align: center;}
                        ; "clear: both; text-align: center;" [<center>]
                        ; "margin-left:40px"
                        ; "margin-left: 1em; margin-right: 1em;" [<indent>]
                        ; "text-align: left;" [<left>]
                        ; "white-space: pre;"
                        ; "white-space:pre"
                        ; "white-space: pre; " [<ipre>]
                        ; {line-height: 1.4; list-style-image: initial; list-style-position: initial; list-style-type: disc; margin-bottom: 0.5em; margin-left: 0px; margin-right: 0px; margin-top: 0.5em; padding-bottom: 0px; padding-left: 2.5em; padding-right: 2.5em; padding-top: 0px;} [<disc>]
                        ; {border-bottom-style: none; border-color: initial; border-left-style: none; border-right-style: none; border-top-color: initial; border-top-style: none; border-width: initial; margin-bottom: 0.25em; margin-left: 0px; margin-right: 0px; margin-top: 0px; padding-bottom: 0.25em; padding-left: 0px; padding-right: 0px; padding-top: 0.25em; text-indent: 0px;} [<no-border>]
                        ; "color: #eeeeee;"
                        ; {color: #eeeeee; font-family: 'Trebuchet MS', Trebuchet, sans-serif;} [<grey>]
                        ; "line-height: 18px;" [<18px>]
                        ; "border:1px solid #333333;border-bottom-style:none"
                        ; {border-bottom-style: none; border: 1px solid #333333;} [<bordered>]
                        ; "font-family: 'Trebuchet MS', sans-serif;"
                        ; "font-family: Trebuchet MS, sans-serif;"
                        ; {font-family: 'Trebuchet MS', Trebuchet, sans-serif; line-height: 18px;}
                        ; {font-family: 'Trebuchet MS', Trebuchet, sans-serif;} [<chet>]
                        ; "font-size: small;" [<small>]
                    ][
                        keep node/clone
                    ]
                    ; probe node/name
                    if block? node/value [ ; recursive walk through this node's kids.
                        foreach node node/children :rule
                    ]
                    ; Emit the matching closing pseudo-tag: <red> -> </red>.
                    if node/clone [keep back insert copy node/clone "/"]
                ]
            ]

            ; ---- Phase 3: text emitted when a node is left ----
            switch/default node/name [
                <div> <li> <h1> <h2> <h3> <h4> <td> <th> [keep "^/^/"]
                ; Pop the marker pushed when the list was entered.
                <ul> <ol> [remove back tail last-list]
                <pre> <code> [keep "^/^/" in-pre: false]
                <blockquote> [] ; [keep "^/^//blockquote^/^/"]
                <table> [keep "^//table^/^/"]
                ; <div> <span>
                <b> <strong> [keep "=b."]
                <i> <em> [keep "=i."]
                <u> [keep "=u."]
                ; Known tags that need nothing on close.
                <head> ; [probe node/flatten]
                <form> <input>
                <title> <body> <iframe> <img> <tbody> <tr> <br> <a> <span> %.txt []
            ][
                ; Unsupported Tags?
                probe node/name
            ]
        ]

        foreach node html/children :rule
    ]

    ; Post-processing: an ordered list of literal search/replace pairs applied
    ; to the whole output.  Order matters -- later pairs operate on the result
    ; of earlier ones.  FOREACH yields the result of the last REPLACE/ALL,
    ; which is then trimmed at both ends.
    trim/head/tail foreach [old new][
        ; Drop empty wrappers, empty headings and empty list items.
        "<red>^/</red>" "^/"
        "^/--- ^/" ""
        "^/* ^/" ""
        "![]" "![Image]"
        "=b ^/=b." "^/^/"
        "=i ^/=i." "^/^/"
        " =b." "=b. "
        ; Turn the <code> pseudo-tag into =r ... =r. markup.
        "<code></code>" ""
        "<code> </code>" " "
        "<code>" "=r "
        "</code>" "=r."
        " =r." "=r. "
        "=r ^/=r." "^/^/"
        ; Bytes C2 A0 (UTF-8 non-breaking space) become a plain space.
        #{C2A0} " "
        "^/: " "^/"
        ; Special-case a red "R" into =[R].
        "=b <red>R</red>=b." "<red>R</red>"
        "=b <red>R</red>e=b." "<red>R</red>e"
        "<red>=b R=b.</red>" "<red>R</red>"
        "<red>R</red>" "=[R]"
        ; Rewrite three- and two-deep "#" list prefixes.
        "^/###" "^/#>>"
        "^/##" "^/#>"
        ; Blank out whitespace-only lines, then collapse runs of newlines.
        "^/^-^/" "^/^/"
        "^/^- ^/" "^/^/"
        "^/^/^/^/" "^/^/"
        "^/^/^/" "^/^/"
        "^/^/^/" "^/^/"
        "^/^/^/" "^/^/"
    ][
        replace/all html old new
    ]
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment