Skip to content

Instantly share code, notes, and snippets.

@kizernis
Last active December 4, 2018 03:01
Show Gist options
  • Save kizernis/c3ec556f96e43352a7e65765ceaf4b57 to your computer and use it in GitHub Desktop.
Save kizernis/c3ec556f96e43352a7e65765ceaf4b57 to your computer and use it in GitHub Desktop.
import requests
from bs4 import BeautifulSoup
import re
# Feed whose entries are scanned for keyword matches.
FEED_URL = 'https://www.kill-the-newsletter.com/feeds/axl40j979d8l9a8f07q2.xml'
# Text file holding one keyword per line.
KEYWORDS_PATH = 'input.txt'
# Only the text after the first occurrence of this marker is searched.
SECTION_SEPARATOR = '****************************'
# Upper bound (seconds) on the feed download so the script cannot hang forever.
REQUEST_TIMEOUT_SECONDS = 30

# A block starts with "<number>) " at the beginning of a line and runs up to
# the "Back to Top Back to Category Index" footer that ends a line.
NUMBERED_BLOCK_RE = re.compile(
    r'^(\d+\)\s+)(.+?)([\s\r\n]+Back to Top Back to Category Index)$',
    flags=re.MULTILINE | re.DOTALL,
)
# Three or more consecutive line breaks.
EXCESS_LINE_BREAKS_RE = re.compile(r'[\r\n]{3,}')


def compile_keyword_pattern(keywords):
    """Build one case-insensitive pattern that matches any of *keywords*.

    Each keyword is used as a regular-expression fragment, exactly as it
    appears in the input (no escaping is applied). Blank or whitespace-only
    entries are dropped: an empty alternative would match every string and
    turn every block into a hit.

    Args:
        keywords: Iterable of keyword strings, typically the lines of a file.

    Returns:
        A compiled pattern. When no usable keyword remains, the pattern
        never matches anything.
    """
    usable = [keyword for keyword in keywords if keyword.strip()]
    if not usable:
        # An empty negative lookahead fails at every position.
        return re.compile(r'(?!)')
    return re.compile('|'.join(usable), flags=re.IGNORECASE)


def extract_blocks(text):
    """Return the body of every numbered block found in *text*.

    Only the part of *text* after the first ``SECTION_SEPARATOR`` is
    examined, and runs of three or more line breaks are collapsed to two
    before the blocks are located.

    Args:
        text: Plain text of one feed entry.

    Returns:
        A list with the body text of each block (without the leading number
        and without the trailing footer). The list is empty when the
        separator is missing instead of raising ``IndexError``.
    """
    _, separator, remainder = text.partition(SECTION_SEPARATOR)
    if not separator:
        return []
    remainder = EXCESS_LINE_BREAKS_RE.sub('\n\n', remainder)
    return [groups[1] for groups in NUMBERED_BLOCK_RE.findall(remainder)]


def html_to_text(markup):
    """Convert an HTML fragment to plain text, turning ``<br>`` into newlines."""
    soup = BeautifulSoup(markup, 'lxml')
    for line_break in soup.find_all('br'):
        line_break.replace_with('\n')
    return soup.get_text()


def main():
    """Print every feed block that mentions a keyword, then a summary line.

    Keywords are read from ``KEYWORDS_PATH`` and the feed is downloaded from
    ``FEED_URL``. Output goes to stdout; redirect it to keep a copy in a file.

    Raises:
        OSError: If the keyword file cannot be read.
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    with open(KEYWORDS_PATH, encoding='utf-8') as keywords_file:
        pattern = compile_keyword_pattern(keywords_file.read().splitlines())

    response = requests.get(FEED_URL, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail loudly rather than parsing an error page and reporting zero blocks.
    response.raise_for_status()
    feed = BeautifulSoup(response.text, 'lxml')

    blocks_count_total = 0
    blocks_count_matched = 0
    for entry in feed.find_all('content'):
        # The element's text is itself HTML markup, so it is parsed again.
        blocks = extract_blocks(html_to_text(entry.get_text()))
        blocks_count_total += len(blocks)
        for block in blocks:
            if pattern.search(block):
                blocks_count_matched += 1
                print(block + '\n\n\n')

    print(f'Total blocks: {blocks_count_total}, matched blocks: {blocks_count_matched}')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment