Skip to content

Instantly share code, notes, and snippets.

@elisong
Created January 5, 2021 12:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save elisong/6964a75e466282fc1164cba1a950b478 to your computer and use it in GitHub Desktop.
Save elisong/6964a75e466282fc1164cba1a950b478 to your computer and use it in GitHub Desktop.
<?xml version="1.0" encoding="utf-8" standalone="yes" ?>
<rss version="2.0"
xmlns:atom="http://www.w3.org/2005/Atom">
<channel>
<title>RStudio Blog</title>
<link>https://blog.rstudio.com/</link>
<description>Recent content on RStudio Blog</description>
<generator>Hugo -- gohugo.io</generator>
<language>en-us</language>
<managingEditor>info@rstudio.com (RStudio, Inc.)</managingEditor>
<webMaster>info@rstudio.com (RStudio, Inc.)</webMaster>
<lastBuildDate>Wed, 23 Dec 2020 00:00:00 +0000</lastBuildDate>
<atom:link href="https://blog.rstudio.com/" rel="self" type="application/rss+xml" />
<item>
<title>Exploring US COVID-19 Cases and Deaths</title>
<link>https://blog.rstudio.com/2020/12/23/exploring-us-covid-19-cases/</link>
<pubDate>Wed, 23 Dec 2020 00:00:00 +0000</pubDate>
<author>Art Steinmetz</author>
<guid>https://blog.rstudio.com/2020/12/23/exploring-us-covid-19-cases/</guid>
<description>
&lt;img src="pic12.png" /&gt;
&lt;a src="anchor12.png" /&gt;
&lt;script src="script12.js" /&gt;
</description>
</item>
<item>
<title>Winners of the 2020 RStudio Table Contest</title>
<link>https://blog.rstudio.com/2020/12/23/winners-of-the-2020-rstudio-table-contest/</link>
<pubDate>Wed, 23 Dec 2020 00:00:00 +0000</pubDate>
<author>Rich Iannone and Curtis Kephart</author>
<guid>https://blog.rstudio.com/2020/12/23/winners-of-the-2020-rstudio-table-contest/</guid>
<description>
&lt;img src="pic22.png" /&gt;
&lt;a src="anchor22.png" /&gt;
&lt;script src="script22.js" /&gt;
</description>
</item>
</channel>
</rss>
<?xml version='1.0' encoding='utf-8'?>
<rss xmlns:ns0="http://www.w3.org/2005/Atom" version="2.0">
<channel>
<title>RStudio Blog</title>
<link>https://blog.rstudio.com/</link>
<description>Recent content on RStudio Blog</description>
<generator>Hugo -- gohugo.io</generator>
<language>en-us</language>
<managingEditor>info@rstudio.com (RStudio, Inc.)</managingEditor>
<webMaster>info@rstudio.com (RStudio, Inc.)</webMaster>
<lastBuildDate>Wed, 23 Dec 2020 00:00:00 +0000</lastBuildDate>
<ns0:link href="https://blog.rstudio.com/" rel="self" type="application/rss+xml" />
<item>
<title>Exploring US COVID-19 Cases and Deaths</title>
<link>https://blog.rstudio.com/2020/12/23/exploring-us-covid-19-cases/</link>
<pubDate>Wed, 23 Dec 2020 00:00:00 +0000</pubDate>
<author>Art Steinmetz</author>
<guid>https://blog.rstudio.com/2020/12/23/exploring-us-covid-19-cases/</guid>
<description>
&lt;img src="https://blog.rstudio.com/2020/12/23/exploring-us-covid-19-cases/pic12.png" /&gt;
&lt;a src="https://blog.rstudio.com/2020/12/23/exploring-us-covid-19-cases/anchor12.png" /&gt;
&lt;script src="script12.js" /&gt;
</description>
</item>
<item>
<title>Winners of the 2020 RStudio Table Contest</title>
<link>https://blog.rstudio.com/2020/12/23/winners-of-the-2020-rstudio-table-contest/</link>
<pubDate>Wed, 23 Dec 2020 00:00:00 +0000</pubDate>
<author>Rich Iannone and Curtis Kephart</author>
<guid>https://blog.rstudio.com/2020/12/23/winners-of-the-2020-rstudio-table-contest/</guid>
<description>
&lt;img src="https://blog.rstudio.com/2020/12/23/winners-of-the-2020-rstudio-table-contest/pic22.png" /&gt;
&lt;a src="https://blog.rstudio.com/2020/12/23/winners-of-the-2020-rstudio-table-contest/anchor22.png" /&gt;
&lt;script src="script22.js" /&gt;
</description>
</item>
</channel>
</rss>
# Platform : Darwin-20.2.0-x86_64-i386-64bit
# Version : 3.7.4
import platform
import re
import subprocess
from urllib.parse import urljoin

import defusedxml.cElementTree as ET
# Matches a relative ``src``/``href`` value inside an ``<img>`` or ``<a>`` tag.
#   group 1: everything from ``<`` up to and including the ``=`` sign
#   group 2: the opening quote character; the same character must close the value
#   group 3: the attribute value, which must not start with ``http://``,
#            ``https://`` or ``/``
# ``\s`` after the tag name keeps tags such as ``<article>`` from matching, and
# the ``(?<=\s)`` look-behind keeps attributes such as ``data-src`` from matching.
_RELATIVE_LINK_RE = re.compile(
    r"""(<(?:img|a)\s[^>]*?(?<=\s)(?:src|href)=)(["'])"""
    r"""(?!https?://|/)((?:(?!\2).)+)\2"""
)


def _absolutize_links(html, base_url):
    """Return *html* with relative ``<img>``/``<a>`` URLs resolved against *base_url*.

    Only ``src``/``href`` attributes of ``<img>`` and ``<a>`` tags are touched.
    Values that start with ``http://``, ``https://`` or ``/`` are left alone, as
    are values carrying another scheme (``urljoin`` returns those unchanged).

    Args:
        html: Markup fragment to rewrite.
        base_url: URL that relative references are resolved against.

    Returns:
        The rewritten markup; text that does not match is returned untouched.
    """
    def _resolve(match):
        prefix, quote, target = match.groups()
        # Build the result by plain concatenation: a replacement *function* is
        # used so that ``base_url`` is never parsed as a regex template (a
        # backslash or leading digit in it would otherwise be misinterpreted).
        return prefix + quote + urljoin(base_url, target) + quote

    return _RELATIVE_LINK_RE.sub(_resolve, html)


def main(input_file="rss-xml-url-sub-input.xml",
         output_file="rss-xml-url-sub-output.xml"):
    """Rewrite relative image/anchor URLs in an RSS feed to absolute URLs.

    Each ``<item>``'s ``<description>`` text is rewritten using that item's
    ``<link>`` text as the base URL, and the resulting tree is written to
    *output_file*. The first line containing ``img`` in the input and in the
    output is printed before and after the rewrite.

    Args:
        input_file: Path of the RSS XML file to read.
        output_file: Path the rewritten RSS XML is written to.

    Raises:
        subprocess.CalledProcessError: If ``grep`` exits non-zero, e.g. when no
            line of the inspected file contains ``img``.
    """
    # Pass grep an argument list instead of a shell string so the file name is
    # never interpreted by a shell.
    print("Before :", subprocess.check_output(
        ["grep", "-m", "1", "img", input_file]))

    tree = ET.parse(input_file)
    root = tree.getroot()
    for item in root.iter("item"):
        link_url = (item.findtext("link") or "").strip()
        description = item.find("description")
        # Items without a usable link or description have nothing to rewrite.
        if not link_url or description is None or not description.text:
            continue
        description.text = _absolutize_links(description.text, link_url)

    tree.write(output_file, encoding="utf-8", xml_declaration=True)
    print("After :", subprocess.check_output(
        ["grep", "-m", "1", "img", output_file]))
if __name__ == "__main__":
    # Report the runtime environment before doing any work, so the printed
    # results can be tied to a specific platform and interpreter version.
    for label, value in (
        ("Platform : ", platform.platform()),
        ("Version : ", platform.python_version()),
    ):
        print(label, value)
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment