Skip to content

Instantly share code, notes, and snippets.

@dtoma
Created January 8, 2020 13:12
Show Gist options
  • Save dtoma/13feb50aa8c803a4f0e0dff2493d4856 to your computer and use it in GitHub Desktop.
Save dtoma/13feb50aa8c803a4f0e0dff2493d4856 to your computer and use it in GitHub Desktop.
import os
import time

import requests
from bs4 import BeautifulSoup
# LaTeX document skeleton (octavo class) used for every saved article.
# It carries two %s placeholders that main() fills with the %-operator:
# first the page title (typeset centred and \large), then the body text.
latex_template = r"""\documentclass[imperial, twoside, 12pt]{octavo}
\begin{document}
\centering{\large %s }
%s
\end{document}
"""
# Characters with special meaning in LaTeX, mapped to their escaped spelling.
# Applied in a single pass via str.translate so replacements are never
# re-escaped (e.g. the braces of \textbackslash{} stay intact).
_LATEX_ESCAPES = str.maketrans({
    '\\': r'\textbackslash{}',
    '{': r'\{',
    '}': r'\}',
    '&': r'\&',
    '%': r'\%',
    '$': r'\$',
    '#': r'\#',
    '_': r'\_',
    '~': r'\textasciitilde{}',
    '^': r'\textasciicircum{}',
})


def _latex_escape(text):
    """Return *text* with LaTeX special characters escaped."""
    return text.translate(_LATEX_ESCAPES)


def main():
    """Download selected essays and save each one wrapped in ``latex_template``.

    Reads the link index from the local file ``articles.html``, takes the
    anchors in the slice ``[1:2]`` (i.e. only the second anchor), fetches each
    target relative to ``root`` and writes the page title and body text, LaTeX
    escaped, to ``./articles/<href>``.

    Anchors without an ``href`` are ignored, and a page that cannot be
    fetched is reported on stdout and skipped.

    Raises:
        OSError: if ``articles.html`` cannot be read or an output file
            cannot be written.
    """
    root = 'http://www.paulgraham.com/'
    # NOTE(review): 'articles.html' is presumably a saved copy of
    # http://www.paulgraham.com/articles.html -- the live fetch of that URL
    # was disabled in favour of the local file; confirm before re-enabling.
    # Read bytes so BeautifulSoup detects the document encoding itself
    # instead of relying on the platform's default text encoding.
    with open('articles.html', 'rb') as f:
        html = f.read()
    soup = BeautifulSoup(html, 'html.parser')

    hrefs = [anchor.get('href') for anchor in soup.find_all('a')[1:2]]
    # Anchors without an href (e.g. named anchors) cannot be fetched.
    few_links = [(root + href, href) for href in hrefs if href]

    for url, href in few_links:
        # Be polite to the server: pause before every request.
        time.sleep(1)
        path = './articles/' + href
        print('save', url, 'to', path)

        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as exc:
            # Do not store an error page as an article; report and move on.
            print('skip', url, exc)
            continue

        as_soup = BeautifulSoup(response.content, 'html.parser')

        # Fall back to the href when the page has no usable <title>.
        title_tag = as_soup.title
        if title_tag is not None and title_tag.string:
            title = title_tag.string
        else:
            title = href

        # Fall back to the whole document when there is no <body> element.
        body_tag = as_soup.body
        if body_tag is None:
            body_tag = as_soup
        text = body_tag.get_text()

        # The output directory may not exist yet on a fresh checkout.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, 'w', encoding='utf-8') as f:
            f.write(latex_template % (_latex_escape(title), _latex_escape(text)))
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment