Last active
December 11, 2023 16:39
-
-
Save lezsakdomi/2b755011f070038884e960c44cc80cc9 to your computer and use it in GitHub Desktop.
Download Spotfire docs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Mirror the Spotfire Analyst API reference from docs.tibco.com with wget, then
# convert every downloaded html/*.htm page to Markdown and concatenate the
# results into docs.md (one "# <page title>" heading per page).
#
# Requires bash, wget, pup and pandoc on PATH, and filter.lua in the current
# directory. Set WGET_WAIT (seconds, default 1) to change the delay between
# requests.
set -e

base_url='https://docs.tibco.com/pub/doc_remote/sfire_dev/area/doc/api/TIB_sfire-analyst_api/'
html_dir='docs.tibco.com/pub/doc_remote/sfire_dev/area/doc/api/TIB_sfire-analyst_api/html'

# --no-parent keeps the recursive crawl inside the API reference directory;
# --wait spaces out the requests so the crawl does not hammer the server.
# The headers make the requests look like a desktop Chrome page load.
wget --recursive --no-parent --wait="${WGET_WAIT:-1}" "$base_url" \
  --header='authority: docs.tibco.com' \
  --header='accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7' \
  --header='accept-language: en-US,en;q=0.9,hu-HU;q=0.8,hu;q=0.7,de-AT;q=0.6,de;q=0.5' \
  --header='cache-control: max-age=0' \
  --header='sec-ch-ua: "Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"' \
  --header='sec-ch-ua-mobile: ?0' \
  --header='sec-ch-ua-platform: "Linux"' \
  --header='sec-fetch-dest: document' \
  --header='sec-fetch-mode: navigate' \
  --header='sec-fetch-site: none' \
  --header='sec-fetch-user: ?1' \
  --header='upgrade-insecure-requests: 1' \
  --header='user-agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

# Collect the pages with a glob instead of parsing `ls`; nullglob makes an
# empty download directory yield an empty array rather than the literal pattern.
shopt -s nullglob
pages=("$html_dir"/*.htm)
shopt -u nullglob

n=${#pages[@]}
if [ "$n" -eq 0 ]; then
  echo "No .htm pages found under $html_dir" >&2
  exit 1
fi

i=0
for f in "${pages[@]}"; do
  i=$((i + 1))
  # Progress goes to stderr so it does not end up in docs.md.
  echo "[$i/$n] $f" >&2
  echo
  echo
  # Page heading: "# " followed by the text of the page's <title> element.
  printf '# '
  pup 'title text{}' <"$f"
  echo
  pandoc --from html-native_divs-native_spans "$f" --lua-filter=filter.lua --to markdown
done >docs.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- Pandoc filter hook for header elements.
-- Clears the header's identifier, classes and key/value attributes and returns
-- the same element; presumably this keeps attribute suffixes out of the
-- generated Markdown headings.
-- Intentionally global: pandoc finds filter functions by element-type name.
-- @param elem the header element to clean (mutated in place)
-- @return the same element with empty identifier, classes and attributes
function Header(elem)
  elem.identifier = ""
  elem.classes = {}
  elem.attributes = {}
  return elem
end
--- Pandoc filter hook for code blocks.
-- Removes the "xml:space" attribute from the block and leaves every other
-- attribute untouched.
-- Intentionally global: pandoc finds filter functions by element-type name.
-- @param block the code block element (mutated in place)
-- @return the same element without an "xml:space" attribute
function CodeBlock(block)
  block.attributes["xml:space"] = nil
  return block
end
--- Pandoc filter hook for links.
-- Same-page links (target starting with "#") whose title is "Copy" are removed
-- together with their content. Every other link is flattened to a single plain
-- string holding its stringified content, so spans and other nested inlines
-- disappear and no link markup reaches the output.
-- Intentionally global: pandoc finds filter functions by element-type name.
-- @param el the link element
-- @return an empty list (deletes the element) or a Str with the link's text
function Link(el)
  if el.title == "Copy" and el.target:match("^#") then
    -- An empty list deletes the element outright instead of leaving an
    -- empty Str behind in the document.
    return {}
  end
  return pandoc.Str(pandoc.utils.stringify(el))
end
--- Pandoc filter hook for raw blocks.
-- Drops raw HTML blocks that contain a script tag; any other raw block is left
-- unchanged (the function returns nothing for it).
-- Intentionally global: pandoc finds filter functions by element-type name.
-- @param el the raw block element
-- @return an empty list (deletes the block), or nil to keep it as is
function RawBlock(el)
  -- Lower-case the text first so "<SCRIPT" is caught too; plain find avoids
  -- treating the needle as a Lua pattern.
  if el.format == "html" and el.text:lower():find("<script", 1, true) then
    -- An empty list deletes the block instead of leaving an empty paragraph.
    return {}
  end
end
--- Pandoc filter hook for raw inline elements.
-- Drops raw HTML inlines that contain a script tag; any other raw inline is
-- left unchanged (the function returns nothing for it).
-- Intentionally global: pandoc finds filter functions by element-type name.
-- @param el the raw inline element
-- @return an empty list (deletes the inline), or nil to keep it as is
function RawInline(el)
  -- Lower-case the text first so "<SCRIPT" is caught too; plain find avoids
  -- treating the needle as a Lua pattern.
  if el.format == "html" and el.text:lower():find("<script", 1, true) then
    -- An empty list deletes the inline instead of leaving an empty Str.
    return {}
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Running this is not recommended, as it probably puts a high load on TIBCO's servers.