Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
import urllib.request as url_m
from random import randint
from bs4 import BeautifulSoup
# Random walk across Wikipedia: starting from a seed article, save each
# visited page's title and paragraph text to wiki<N>.txt in the current
# directory, then hop to a randomly chosen article linked from one of that
# page's paragraphs.
html_complete = 'https://en.wikipedia.org'
page_name = '/wiki/Johannes_Bilberg'

# Substrings that mark non-article links (project, file, help and talk pages).
excluded_markers = ('Wikipedia', 'File', 'Help', 'Talk')

for i in range(1, 20):  # choice of range arbitrary
    # Close the HTTP response as soon as the body has been read.
    with url_m.urlopen(html_complete + page_name) as response:
        webpage = response.read()
    soup = BeautifulSoup(webpage, 'html.parser')

    # soup.find() returns None when the page has no <title> element.
    title_tag = soup.find('title')
    if title_tag is not None:
        title = title_tag.get_text()
        print(title)
    else:
        title = "Unrecognized Title"
        print('...')

    # Write as UTF-8 explicitly: the platform default encoding may be unable
    # to represent some characters, which used to make writes fail and whole
    # paragraphs get dropped. errors='replace' keeps the output best-effort
    # for anything that still cannot be encoded.
    with open("wiki" + str(i) + ".txt", 'w',
              encoding='utf-8', errors='replace') as f:
        f.write(title + '\n')
        for paragraph in soup.find_all('p'):
            f.write(paragraph.get_text())

    # Only site-relative article paths are usable, because the chosen link is
    # appended to html_complete on the next iteration; absolute or
    # protocol-relative URLs would produce an invalid address.
    list_available_links = []
    for link in soup.select('p a[href]'):
        ref = link['href']
        if not ref.startswith('/wiki/'):
            continue
        if any(marker in ref for marker in excluded_markers):
            continue
        list_available_links.append(ref)

    # A page without any eligible link is a dead end: stop the walk instead
    # of letting randint(0, -1) raise ValueError.
    if not list_available_links:
        print("No further article links found; stopping.")
        break

    lucky_take = randint(0, len(list_available_links) - 1)
    page_name = list_available_links[lucky_take]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment