Skip to content

Instantly share code, notes, and snippets.

@hXtreme
Created July 12, 2019 02:02
Show Gist options
  • Save hXtreme/588c465dc2caaf58282516de40e2567c to your computer and use it in GitHub Desktop.
Save hXtreme/588c465dc2caaf58282516de40e2567c to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# This was originally a Jupyter Notebook so the code might look a bit weird, my condolences.
import requests
from bs4 import BeautifulSoup
# Maximum length (in characters) of the one-line previews built by preview().
MAX_PRV_LEN = 160
# Parser backend handed to BeautifulSoup (stdlib parser, no extra deps).
PARSER = 'html.parser'
# Marker text substituted for <hr> tags so the flat text can be split on it later.
HR_REPLACEMENT = '-----'
def normalize_txt(s: str):
    """Clean scraped chapter text for markdown output.

    Drops non-breaking spaces, separates paragraphs with exactly one
    blank line, and converts em-dash scene separators into '-' rules.
    """
    cleaned = s.replace(chr(160), '')
    # Collapse doubled newlines first, then double every remaining one,
    # so each paragraph ends up followed by exactly one blank line.
    cleaned = cleaned.replace('\n\n', '\n')
    cleaned = cleaned.replace('\n', '\n\n')
    # Runs of 10 or 11 em-dashes (U+2014) on their own line are scene breaks.
    for dash_count in (10, 11):
        run = '\n' + chr(8212) * dash_count + '\n'
        cleaned = cleaned.replace(run, '\n----------\n')
    return cleaned.strip()
def preview(s: str, max_len=None):
    """Return the first line of *s*, truncated to *max_len* characters.

    Appends '...' when the first line had to be truncated.

    Bug fix: the original computed ``min(s.find(chr(10)), max_len)``
    directly; when *s* contains no newline ``find`` returns -1, so the
    slice ``s[:-1]`` silently dropped the last character and ignored the
    length limit entirely.  We now treat "no newline" as "first line is
    the whole string".

    :param s: text to preview (surrounding whitespace is ignored)
    :param max_len: maximum preview length; defaults to MAX_PRV_LEN
    """
    if max_len is None:
        max_len = MAX_PRV_LEN
    s = s.strip()
    # Length of the first line; len(s) when there is no newline at all.
    nl = s.find('\n')
    first_line_len = nl if nl != -1 else len(s)
    prv_len = min(first_line_len, max_len)
    ext = '...' if first_line_len > max_len else ''
    return s[:prv_len].strip() + ext
def get_chp(url):
    """Download one chapter page and return (page preview, chapter text, end notes)."""
    # Fetch the page and parse it into a soup.
    html = requests.get(url=url).text
    soup = BeautifulSoup(html, PARSER)
    soup_prv = preview(soup.text)
    print(soup_prv)
    # The chapter body lives inside the 'entry-content' div.
    content = soup.find(
        name='div',
        attrs={'class': 'entry-content'}
    )
    # Swap every <hr> tag for a marker string so the flattened text
    # can be split on it during post-processing.
    for divider in content.find_all('hr'):
        divider.replace_with(HR_REPLACEMENT)
    # Flatten to text, then separate the chapter from the trailing notes.
    text = content.text
    main, _, tail = text.rpartition(r'<< | TOC | >>')
    notes = tail.rpartition(r'Related')[0].strip(' -\n')
    main_txt = main.strip()
    print(preview(main_txt))
    print(preview(notes))
    # Split on at most two HR markers and keep the final piece —
    # the parts before it are header/title junk, presumably.
    main_parts = main_txt.split(HR_REPLACEMENT, maxsplit=2)
    chp_txt = '# ' + (main_parts[-1].lstrip()).rstrip(' -')
    return (soup_prv, chp_txt, notes)
# Test
# test_url = r'http://www.scarletmadness.org/2017/12/07/stos-chapter-17/'
# test_url = r'http://www.scarletmadness.org/2017/11/30/stos-chapter-16/'
test_url = r'http://www.scarletmadness.org/2017/12/14/stos-chapter-18/'
url = test_url
prv, chp, notes = get_chp(url)
print(normalize_txt(chp))
print(normalize_txt(notes))
# Save Result
import codecs
# Filename = everything up to the third space in the page preview
# (i.e. roughly the first three words of the page title).
s = -1
for _ in range(3):
    s = prv.find(' ', s + 1)
# print(prv[:s])
FILE = './' + prv[:s] + '.md'
# FILE = './16.md'
chp = normalize_txt(chp)
notes = normalize_txt(notes)
with codecs.open(FILE, 'w', 'utf-8') as f:
    f.write(chp + '\n\n' + notes)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment