Skip to content

Instantly share code, notes, and snippets.

@hXtreme
Created July 12, 2019 02:02
Show Gist options
  • Save hXtreme/588c465dc2caaf58282516de40e2567c to your computer and use it in GitHub Desktop.
Save hXtreme/588c465dc2caaf58282516de40e2567c to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# This was originally a Jupyter Notebook so the code might look a bit weird, my condolences.
import requests
from bs4 import BeautifulSoup
# Maximum length (in characters) of the one-line previews built by preview().
MAX_PRV_LEN = 160
# Parser backend handed to BeautifulSoup (stdlib parser, no extra deps).
PARSER = 'html.parser'
# Marker text substituted for <hr> tags so the flat text can be split on it later.
HR_REPLACEMENT = '-----'
def normalize_txt(s: str):
    """Clean scraped chapter text for markdown output.

    Drops non-breaking spaces, separates paragraphs with exactly one
    blank line, and converts em-dash scene separators into '-' rules.
    """
    cleaned = s.replace(chr(160), '')
    # Collapse doubled newlines first, then double every remaining one,
    # so each paragraph ends up followed by exactly one blank line.
    cleaned = cleaned.replace('\n\n', '\n')
    cleaned = cleaned.replace('\n', '\n\n')
    # Runs of 10 or 11 em-dashes (U+2014) on their own line are scene breaks.
    for dash_count in (10, 11):
        run = '\n' + chr(8212) * dash_count + '\n'
        cleaned = cleaned.replace(run, '\n----------\n')
    return cleaned.strip()
def preview(s: str, max_len=None):
    """Return the first line of *s*, truncated to *max_len* characters.

    Appends '...' when the first line had to be truncated.

    Bug fix: the original computed ``min(s.find(chr(10)), max_len)``
    directly; when *s* contains no newline ``find`` returns -1, so the
    slice ``s[:-1]`` silently dropped the last character and ignored the
    length limit entirely.  We now treat "no newline" as "first line is
    the whole string".

    :param s: text to preview (surrounding whitespace is ignored)
    :param max_len: maximum preview length; defaults to MAX_PRV_LEN
    """
    if max_len is None:
        max_len = MAX_PRV_LEN
    s = s.strip()
    # Length of the first line; len(s) when there is no newline at all.
    nl = s.find('\n')
    first_line_len = nl if nl != -1 else len(s)
    prv_len = min(first_line_len, max_len)
    ext = '...' if first_line_len > max_len else ''
    return s[:prv_len].strip() + ext
def get_chp(url):
    """Download one chapter page and return (page preview, chapter text, end notes)."""
    # Fetch the page and parse it into a soup.
    html = requests.get(url=url).text
    soup = BeautifulSoup(html, PARSER)
    soup_prv = preview(soup.text)
    print(soup_prv)
    # The chapter body lives inside the 'entry-content' div.
    content = soup.find(
        name='div',
        attrs={'class': 'entry-content'}
    )
    # Swap every <hr> tag for a marker string so the flattened text
    # can be split on it during post-processing.
    for divider in content.find_all('hr'):
        divider.replace_with(HR_REPLACEMENT)
    # Flatten to text, then separate the chapter from the trailing notes.
    text = content.text
    main, _, tail = text.rpartition(r'<< | TOC | >>')
    notes = tail.rpartition(r'Related')[0].strip(' -\n')
    main_txt = main.strip()
    print(preview(main_txt))
    print(preview(notes))
    # Split on at most two HR markers and keep the final piece —
    # the parts before it are header/title junk, presumably.
    main_parts = main_txt.split(HR_REPLACEMENT, maxsplit=2)
    chp_txt = '# ' + (main_parts[-1].lstrip()).rstrip(' -')
    return (soup_prv, chp_txt, notes)
# Test
# test_url = r'http://www.scarletmadness.org/2017/12/07/stos-chapter-17/'
# test_url = r'http://www.scarletmadness.org/2017/11/30/stos-chapter-16/'
test_url = r'http://www.scarletmadness.org/2017/12/14/stos-chapter-18/'
url = test_url
prv, chp, notes = get_chp(url)
print(normalize_txt(chp))
print(normalize_txt(notes))
# Save Result
import codecs
# Filename = everything up to the third space in the page preview
# (i.e. roughly the first three words of the page title).
s = -1
for _ in range(3):
    s = prv.find(' ', s + 1)
# print(prv[:s])
FILE = './' + prv[:s] + '.md'
# FILE = './16.md'
chp = normalize_txt(chp)
notes = normalize_txt(notes)
with codecs.open(FILE, 'w', 'utf-8') as f:
    f.write(chp + '\n\n' + notes)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment