Skip to content

Instantly share code, notes, and snippets.

@dotcomboom
Created April 4, 2021 05:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dotcomboom/be63f8bb3bd5796d8bc2cadbfb3b1c94 to your computer and use it in GitHub Desktop.
Save dotcomboom/be63f8bb3bd5796d8bc2cadbfb3b1c94 to your computer and use it in GitHub Desktop.
A script for extracting links from my daily notes
import html
import os
import urllib.parse
from datetime import time

import marko
import requests
from bs4 import BeautifulSoup
from dateparser import parse
# Build "crawl-out.html": for every daily note in `cupboard`, list the links
# found in that note, each labelled with the linked page's <title> and host.
#
# NOTE(review): the pasted source had lost its indentation. The structure
# below (per-file try/except inside the loop, final prettify/print/write
# after the loop) is the reading that makes the script coherent -- confirm
# against the original gist.

cupboard = "Daily"  # directory that holds the Markdown notes
FETCH_TIMEOUT = 10  # seconds; without a timeout requests.get() can block forever


def _note_stem(filename):
    """Return *filename* without a trailing ``.md`` extension."""
    if filename.endswith('.md'):
        return filename[:-len('.md')]
    return filename


def _site_name(url):
    """Return the network location of *url* with a leading ``www.`` removed."""
    host = urllib.parse.urlparse(url).netloc
    if host.startswith('www.'):
        host = host[len('www.'):]
    return host


def _page_title(url, fallback):
    """Fetch *url* and return the text of its ``<title>`` element.

    Returns *fallback* when the request fails or the page has no non-empty
    title, so a single dead link does not abort the whole note.
    """
    try:
        response = requests.get(url, timeout=FETCH_TIMEOUT)
    except requests.RequestException as e:
        print('oh no', e)
        return fallback
    page = BeautifulSoup(response.text, 'html.parser')
    if page.title is None:
        return fallback
    return page.title.text.strip() or fallback


def _day_section(filename):
    """Return the HTML section for one note file, or ``""`` if it has no links.

    Raises:
        OSError: the note cannot be read.
        ValueError: no date can be parsed from the file name.
    """
    with open(os.path.join(cupboard, filename), "r", encoding="utf-8") as note:
        markdown = note.read()
    soup = BeautifulSoup(marko.convert(markdown), "html.parser")

    date = parse(_note_stem(filename))
    if date is None:
        raise ValueError('cannot parse a date from {0!r}'.format(filename))

    items = []
    # href=True skips anchors that carry no href attribute at all.
    for link in soup.find_all("a", href=True):
        href = link['href']
        label = '{0} ({1})'.format(
            _page_title(href, link.text or href), _site_name(href)
        )
        # Titles and URLs come from arbitrary pages: escape them before
        # splicing them into markup.
        items.append('<li><a href="{0}">{1}</a></li>'.format(
            html.escape(href, quote=True), html.escape(label)
        ))
        print('{0} --> {1}'.format(link.text, label))

    if not items:
        return ""
    return "<h2>{0}</h2><ul>{1}</ul>".format(
        date.strftime('%m-%d-%Y'), "".join(items)
    )


epichtml = BeautifulSoup("<h1>HELLOOOOOO</h1>", "html.parser").prettify()

# os.listdir() returns names in arbitrary order, so sort before reversing.
files = sorted(os.listdir(cupboard), reverse=True)
sections = [epichtml]
for filename in files:
    print(filename)
    try:
        sections.append(_day_section(filename))
    except Exception as e:
        # Best effort: report the problem and carry on with the next note.
        print('oh no', e)

epichtml = BeautifulSoup("".join(sections), "html.parser").prettify()
print(epichtml)
with open("crawl-out.html", "w", encoding="utf-8") as out:
    out.write(epichtml)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment