Skip to content

Instantly share code, notes, and snippets.

@mahmudahsan
Created February 25, 2018 09:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mahmudahsan/9a185df65e4b2cc49b28afcbb53c2a20 to your computer and use it in GitHub Desktop.
Save mahmudahsan/9a185df65e4b2cc49b28afcbb53c2a20 to your computer and use it in GitHub Desktop.
def parse_soup_to_simple_html(self):
    """Render the scraped headline links as a minimal HTML page and save it.

    Every ``<h1>``/``<h2>`` found in ``self.__soup`` whose parent element
    carries an ``href`` attribute (presumably a wrapping ``<a>`` -- confirm
    against the scraped site) becomes one ``<li>`` entry of an ordered list.
    The finished page is UTF-8 encoded and handed to
    ``self.write_webpage_as_html`` with the path ``html/simplenews.html``.

    Returns:
        None. The only effect is the call to ``write_webpage_as_html``.
    """
    # Local imports keep this method self-contained; the file's import
    # block is not visible from here.
    import html
    from urllib.parse import urljoin

    htmltext = '''
<html>
<head><title>Simple News Link Scrapper</title></head>
<body>
{NEWS_LINKS}
</body>
</html>
'''
    items = []
    for tag in self.__soup.find_all(['h1', 'h2']):
        href = tag.parent.get('href')
        if not href:
            # Heading is not wrapped in a link; nothing to list.
            continue

        # urljoin resolves relative hrefs against the base URL and leaves
        # already-absolute hrefs untouched; plain concatenation broke both
        # absolute links and relative links without a leading slash.
        link = urljoin(self.__url, href)

        # ``tag.string`` is None when the heading contains nested markup;
        # fall back to the concatenated text so the entry is not "None".
        title = tag.string
        if title is None:
            title = tag.get_text(strip=True)

        # Escape scraped values: the href sits in a single-quoted attribute,
        # so quotes must be escaped there as well as &, < and >.
        items.append(
            "<li><a href='{}' target='_blank'>{}</a></li>\n".format(
                html.escape(link),
                html.escape(str(title), quote=False),
            )
        )

    news_links = '<ol>' + ''.join(items) + '</ol>'
    htmltext = htmltext.format(NEWS_LINKS=news_links)
    self.write_webpage_as_html(filepath="html/simplenews.html", data=htmltext.encode())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment