Created
January 22, 2024 12:53
-
-
Save sebbacon/180e652ad1063148b4eef5a1e60d856a to your computer and use it in GitHub Desktop.
Use console-based JavaScript to parse a complicated DOM, and Playwright to run it against specific URLs.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import subprocess | |
from PIL import Image | |
# This is the complex parsing, written as JavaScript to be run inside the page.
# It does two things:
#   1. Gives every element whose computed font style is italic a light grey
#      inline background, and every other element a white one.
#   2. Wraps each straight or curly double quote found in non-italic text in a
#      <span class="blue-quote">, which a small injected stylesheet highlights
#      in yellow.
# The quote wrapping builds DOM nodes directly (createTextNode / textContent)
# instead of round-tripping the text through innerHTML, so page text that
# happens to contain "<", ">" or "&" is never re-parsed as markup.
js = """document.querySelectorAll('*').forEach(element => {
  const fontStyle = window.getComputedStyle(element).fontStyle;
  if (fontStyle === 'italic') {
    element.style.backgroundColor = 'lightgrey';
  } else {
    element.style.backgroundColor = 'white';
  }
});
function wrapQuotes(textNode) {
  const parent = textNode.parentNode;
  if (window.getComputedStyle(parent).fontStyle === 'italic') {
    return; // Skip if parent is italic
  }
  // Splitting on a capturing group keeps the matched quote characters,
  // which end up at the odd indices of the resulting array.
  const pieces = textNode.nodeValue.split(/(["“”])/);
  if (pieces.length === 1) {
    return; // No quotes in this text node; leave it untouched
  }
  const fragment = document.createDocumentFragment();
  pieces.forEach((piece, index) => {
    if (piece === '') {
      return;
    }
    if (index % 2 === 1) {
      const span = document.createElement('span');
      span.className = 'blue-quote';
      span.textContent = piece;
      fragment.appendChild(span);
    } else {
      fragment.appendChild(document.createTextNode(piece));
    }
  });
  parent.replaceChild(fragment, textNode);
}
function walkTheDOM(node, func) {
  // Snapshot the children so nodes inserted by func are not revisited
  const children = Array.from(node.childNodes);
  children.forEach(child => {
    func(child);
    if (child.childNodes.length) {
      walkTheDOM(child, func);
    }
  });
}
// Add CSS for the yellow highlight on quotes
const style = document.createElement('style');
document.head.appendChild(style);
style.sheet.insertRule('.blue-quote { background-color: yellow; }');
// Walk the DOM and apply changes
walkTheDOM(document.body, function (node) {
  if (node.nodeType === 3) { // Node type 3 is a text node
    wrapQuotes(node);
  }
});
"""
# Now we run it from the command line against several URLs.
# Each page is captured with the `shot-scraper` CLI, which runs the `js`
# snippet in the page before taking the screenshot.
urls = [
    "https://beetsath.wixsite.com/velmadinkley",
    "https://beetsath.wixsite.com/norvillerogers",
    "https://beetsath.wixsite.com/daphneblake",
    "https://beetsath.wixsite.com/scoobertdoo",
    "https://beetsath.wixsite.com/fredjones",
]
# Output filenames, in the same order as `urls`.
files = []
for i, url in enumerate(urls):
    fname = f"part_{i}.png"
    files.append(fname)
    cmd = [
        "shot-scraper",
        "-o",
        fname,
        url,
        "-s",
        "#Containerc1dmp",
        "--javascript",
        js,
    ]
    # check=True raises CalledProcessError when shot-scraper exits non-zero,
    # so a failed capture stops the run instead of leaving `files` pointing at
    # a screenshot that was never written.
    subprocess.run(cmd, check=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment