Skip to content

Instantly share code, notes, and snippets.

@rexbannon
Created October 29, 2020 21:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save rexbannon/115f74c845f27b9fc14522cb13dc16c5 to your computer and use it in GitHub Desktop.
Save rexbannon/115f74c845f27b9fc14522cb13dc16c5 to your computer and use it in GitHub Desktop.
Ljodafundur
import requests
import re
import pprint
from bs4 import BeautifulSoup
import random
import html2text
import dominate
from dominate.tags import *
# Page that is scanned for links whose href contains 'view_poet'
# (presumably the per-poet pages -- confirm against the site).
URL = 'http://www.ljod.is/index.php/ljod/poem_collection/author'

# A timeout keeps the script from hanging forever on a dead connection, and
# raise_for_status() fails loudly instead of silently parsing an error page.
page = requests.get(URL, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# href=True restricts the search to anchors that actually carry an href, so
# a bare <a name="..."> can no longer raise KeyError. The result is built
# directly instead of going through a temporary named `list`, which used to
# shadow the builtin for the rest of the module.
author = [
    a['href']
    for a in soup.find_all('a', href=True)
    if 'view_poet' in a['href']
]
print(author)
print(len(author))
# Choose one of the collected 'view_poet' links at random and fetch it.
# Stop with a readable message when nothing was collected, rather than
# letting the random pick fail on an empty sequence.
if not author:
    raise SystemExit('No view_poet links found; nothing to choose from.')
newurl = random.choice(author)

# Timeout and status check: never hang on, or parse, a failed response.
newpage = requests.get(newurl, timeout=30)
newpage.raise_for_status()
newsoup = BeautifulSoup(newpage.content, 'html.parser')

# Collect every link on the fetched page whose href contains 'view_poem'.
# href=True skips anchors without an href, which would raise KeyError.
ljod = [
    a['href']
    for a in newsoup.find_all('a', href=True)
    if 'view_poem' in a['href']
]
print(ljod)
# Choose one of the collected 'view_poem' links at random and fetch it.
# Stop with a readable message when the list is empty, rather than letting
# the random pick fail on an empty sequence.
if not ljod:
    raise SystemExit('No view_poem links found; nothing to choose from.')
ljoda_url = random.choice(ljod)

# Timeout and status check: never hang on, or parse, a failed response.
ljoda_page = requests.get(ljoda_url, timeout=30)
ljoda_page.raise_for_status()
ljoda_supa = BeautifulSoup(ljoda_page.content, 'html.parser')

# Every <span class="poem"> element on the fetched page.
ljodid = ljoda_supa.find_all('span', class_='poem')
# Matches a single markup tag: '<', anything up to the next '>', then '>'.
# A negated character class is used instead of '.' so that a tag broken
# across several lines is still matched ('.' does not match a newline).
_TAG_RE = re.compile(r'<[^>]*>')


def cleanhtml(raw_html):
    """Return the string form of *raw_html* with all markup tags removed.

    Args:
        raw_html: Any object; it is converted with ``str()`` before the
            tags are stripped, so plain strings and objects that render
            themselves as markup are both accepted.

    Returns:
        The text with every ``<...>`` span deleted. Text between tags,
        including whitespace and newlines, is left untouched. Character
        entities such as ``&amp;`` are not decoded.
    """
    return _TAG_RE.sub('', str(raw_html))
# ljodid holds the result of find_all(), i.e. a list of matched elements.
# Clean each element on its own and join the pieces, so that the list's own
# "[", "]" and ", " separators do not end up in the printed text.
print('\n\n'.join(cleanhtml(span) for span in ljodid))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment