Skip to content

Instantly share code, notes, and snippets.

@bdmorin
Last active April 24, 2024 20:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bdmorin/8ef7a9e2082fa7bc90c9878a34b37a59 to your computer and use it in GitHub Desktop.
Save bdmorin/8ef7a9e2082fa7bc90c9878a34b37a59 to your computer and use it in GitHub Desktop.
from newspaper import Article
import markdownify
def extract_article(url):
    """Fetch the page at *url* and return the extracted article text.

    The page is downloaded and parsed with newspaper's ``Article``; the
    returned value is that object's ``text`` attribute.
    """
    page = Article(url)
    page.download()
    page.parse()
    return page.text
def convert_to_markdown(html_content):
    """Return *html_content* converted to Markdown via markdownify.

    Headings are requested in the ``ATX`` heading style.
    """
    return markdownify.markdownify(html_content, heading_style="ATX")
def main(url="https://archive.is/dLn1D"):
    """Extract the article at *url* and print it as Markdown.

    Args:
        url: Address of the article to fetch. Defaults to the archived page
            this script was originally written against, so calling
            ``main()`` with no arguments behaves as before.
    """
    # NOTE(review): extract_article() returns the article's `text` attribute,
    # which looks like already-extracted text rather than HTML, so the
    # Markdown conversion below may have little markup to work with - confirm.
    article_text = extract_article(url)
    markdown_text = convert_to_markdown(article_text)
    print(markdown_text)
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
import html2text
import sys
# NOW WITH NO MARKDOWNIFY!
def custom_pre_handler(attrs, content):
    """Wrap *content* in a fenced Markdown code block.

    Args:
        attrs: Mapping of the element's attributes. The ``class`` entry may
            be missing, empty, or ``None``.
        content: Text to place inside the fence.

    Returns:
        The fenced block as a string. The fence is tagged ``sh`` when
        ``shell`` is one of the whitespace-separated class names, and is
        left untagged otherwise.
    """
    # `or ''` also covers a key that is present with a None value, which the
    # default argument of .get() does not handle.
    classes = (attrs.get('class') or '').split()
    language = 'sh' if 'shell' in classes else ''
    return f"```{language}\n{content}\n```"
def convert_html_to_markdown(file_path, encoding="utf-8"):
    """Read an HTML file and return its content converted to Markdown.

    Args:
        file_path: Path of the HTML file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform's locale setting.

    Returns:
        The Markdown string produced by ``html2text``.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid in *encoding*.
    """
    # Configure html2text
    converter = html2text.HTML2Text()
    converter.body_width = 0        # Sets no wrap
    converter.emphasis_mark = '*'   # Use '*' for emphasis
    converter.strong_mark = '**'    # Use '**' for strong
    converter.ul_item_mark = '-'    # Use '-' for unordered lists
    # NOTE(review): `pre_processor` does not appear to be a documented
    # html2text hook, so this assignment may have no effect on the output.
    # `tag_callback` looks like the supported extension point - confirm.
    converter.pre_processor = custom_pre_handler

    # Read HTML file with an explicit encoding rather than the locale default.
    with open(file_path, 'r', encoding=encoding) as file:
        html_input = file.read()

    # Convert HTML to Markdown
    return converter.handle(html_input)
if __name__ == "__main__":
    # Exactly one command-line argument is expected: the HTML file to convert.
    if len(sys.argv) == 2:
        file_path = sys.argv[1]
        markdown_output = convert_html_to_markdown(file_path)
        print(markdown_output)
    else:
        print("Usage: python script.py <file_path>")
        sys.exit(1)

All products | Books to Scrape - Sandbox

Books to Scrape We love being scraped!

All products

1000 results - showing 1 to 20.

Warning! This is a demo website for web scraping purposes. Prices and ratings here were randomly assigned and have no real meaning.

  1. A Light in the Attic

£51.77

In stock

Add to basket 2. Tipping the Velvet

£53.74

In stock

Add to basket 3. Soumission

£50.10

In stock

Add to basket 4. Sharp Objects

£47.82

In stock

Add to basket 5. Sapiens: A Brief History of Humankind

£54.23

In stock

Add to basket 6. The Requiem Red

£22.65

In stock

Add to basket 7. The Dirty Little Secrets of Getting Your Dream Job

£33.34

In stock

Add to basket 8. The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull

£17.93

In stock

Add to basket 9. The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics

£22.60

In stock

Add to basket 10. The Black Maria

£52.15

In stock

Add to basket 11. Starving Hearts (Triangular Trade Trilogy, #1)

£13.99

In stock

Add to basket 12. Shakespeare's Sonnets

£20.66

In stock

Add to basket 13. Set Me Free

£17.46

In stock

Add to basket 14. Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)

£52.29

In stock

Add to basket 15. Rip it Up and Start Again

£35.02

In stock

Add to basket 16. Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991

£57.25

In stock

Add to basket 17. Olio

£23.88

In stock

Add to basket 18. Mesaerion: The Best Science Fiction Stories 1800-1849

£37.59

In stock

Add to basket 19. Libertarianism for Beginners

£51.33

In stock

Add to basket 20. It's Only the Himalayas

£45.17

In stock

Add to basket

import sys
import requests
from bs4 import BeautifulSoup
import markdownify
def html_to_markdown(url, timeout=30):
    """Download the page at *url* and return it converted to Markdown.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request can block indefinitely on an unresponsive
            host.

    Returns:
        The Markdown text, or ``None`` if the request failed (connection
        error, timeout, or an HTTP error status). The failure is reported
        on standard output.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return None

    return markdownify.markdownify(response.text, heading_style="ATX")
def save_markdown(markdown_text, file_path):
    """Write *markdown_text* to *file_path* as UTF-8.

    The encoding is fixed to UTF-8 so that non-ASCII characters in the
    converted page are written the same way on every platform, instead of
    depending on the locale's default encoding.

    Args:
        markdown_text: The Markdown content to write.
        file_path: Destination path; an existing file is overwritten.

    Returns:
        None. Success or failure is reported on standard output; I/O errors
        are caught and printed rather than raised.
    """
    try:
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(markdown_text)
        print(f"Markdown saved to {file_path}")
    except OSError as e:  # IOError is an alias of OSError in Python 3
        print(f"Error writing to file {file_path}: {e}")
def main():
    """Command-line entry point: fetch a URL and save it as Markdown.

    Expects exactly two command-line arguments, the URL to fetch and the
    output file path. Exits with status 1 when the arguments are wrong or
    when the page could not be fetched, so callers and shell scripts can
    detect the failure.
    """
    if len(sys.argv) != 3:
        print("Usage: python app.py <URL> <output_file>")
        sys.exit(1)

    url, output_file = sys.argv[1], sys.argv[2]
    markdown_text = html_to_markdown(url)

    # html_to_markdown() returns None on a failed request and has already
    # printed the error; report the failure through the exit status too.
    if markdown_text is None:
        sys.exit(1)

    # An empty string is a valid (empty) document and is still written.
    save_markdown(markdown_text, output_file)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment