Skip to content

Instantly share code, notes, and snippets.

@sebbacon
Created January 22, 2024 12:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sebbacon/180e652ad1063148b4eef5a1e60d856a to your computer and use it in GitHub Desktop.
Save sebbacon/180e652ad1063148b4eef5a1e60d856a to your computer and use it in GitHub Desktop.
Use console-based JavaScript to parse a complicated DOM, and Playwright to run it against specific URLs
import subprocess
from PIL import Image
# Browser-side script that is handed to shot-scraper via --javascript.
# It does two things to the loaded page:
#   1. gives every element whose computed font style is italic a light grey
#      background (every other element is forced to white), and
#   2. wraps each straight or curly double quote found outside italic text
#      in a <span class="blue-quote">, which a freshly inserted CSS rule
#      paints yellow.
# The script deliberately begins with the querySelectorAll statement; keep
# that as its first token when editing.
js = """document.querySelectorAll('*').forEach(element => {
  const fontStyle = window.getComputedStyle(element).fontStyle;
  if (fontStyle === 'italic') {
    element.style.backgroundColor = 'lightgrey';
  } else {
    element.style.backgroundColor = 'white';
  }
});

// Wrap every double quote of a text node in a highlight span, unless the
// text is rendered in italics or lives inside a <style>/<script> element.
function wrapQuotes(textNode) {
  const parent = textNode.parentNode;
  const tag = parent.nodeName.toLowerCase();
  if (tag === 'style' || tag === 'script') {
    return; // Rewriting these would corrupt the CSS / script source
  }
  if (window.getComputedStyle(parent).fontStyle === 'italic') {
    return; // Skip if parent is italic
  }
  // Splitting on a capturing group keeps the separators: even indexes hold
  // plain text, odd indexes hold exactly one quote character.
  const pieces = textNode.nodeValue.split(/(["“”])/);
  if (pieces.length === 1) {
    return; // No quotes in this node, leave it untouched
  }
  // Build real DOM nodes instead of re-parsing the text as HTML, so that
  // characters such as '<' and '&' in the page text stay literal text.
  const fragment = document.createDocumentFragment();
  pieces.forEach((piece, index) => {
    if (index % 2 === 1) {
      const span = document.createElement('span');
      span.className = 'blue-quote';
      span.textContent = piece;
      fragment.appendChild(span);
    } else if (piece !== '') {
      fragment.appendChild(document.createTextNode(piece));
    }
  });
  parent.replaceChild(fragment, textNode);
}

// Depth-first traversal calling func on every descendant of node.
function walkTheDOM(node, func) {
  // Snapshot the children first: func may replace nodes while we iterate.
  let children = Array.from(node.childNodes);
  children.forEach(child => {
    func(child);
    if (child.childNodes.length) {
      walkTheDOM(child, func);
    }
  });
}

// Add CSS for the yellow highlight on quotes
const style = document.createElement('style');
document.head.appendChild(style);
style.sheet.insertRule('.blue-quote { background-color: yellow; }');

// Walk the DOM and apply changes
walkTheDOM(document.body, function (node) {
  if (node.nodeType === 3) { // Node type 3 is a text node
    wrapQuotes(node);
  }
});
"""
# The pages to screenshot. They all share one host and differ only in the
# final path segment, so the list is built from that common prefix.
urls = [
    "https://beetsath.wixsite.com/" + site
    for site in (
        "velmadinkley",
        "norvillerogers",
        "daphneblake",
        "scoobertdoo",
        "fredjones",
    )
]
# Run shot-scraper once per URL, passing the script in `js` via --javascript,
# and record each output file name in `files` in the same order as `urls`.
files = []
for i, url in enumerate(urls):
    fname = f"part_{i}.png"
    files.append(fname)
    cmd = [
        "shot-scraper",
        "-o",
        fname,
        url,
        "-s",
        "#Containerc1dmp",
        "--javascript",
        js,
    ]
    # check=True makes a non-zero exit from shot-scraper raise
    # CalledProcessError, so a failed capture stops the run instead of
    # silently leaving an expected PNG missing.
    subprocess.run(cmd, check=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment