Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
# Collect link targets from saved HTML pages into a CSV file.
#
# Every file in `path` whose name contains '.html' (and does not contain
# '._') is parsed; for each <h4 class="aviso-data-title"> heading the href of
# its first anchor that carries an href is collected. All collected hrefs are
# written to 'hrefs.csv' in the current working directory.
import os

import pandas as pd
from bs4 import BeautifulSoup

hrefs = []
path = '/Volumes/UNTITLED/wimoveis/paginas/'
for fname in os.listdir(path):
    print(fname)
    # Skip anything that is not a saved page. Names containing '._' are
    # presumably macOS AppleDouble sidecar files -- TODO confirm.
    if '.html' not in fname or '._' in fname:
        continue
    # Read raw bytes so BeautifulSoup can detect the document encoding itself
    # instead of relying on the platform's default text encoding.
    with open(os.path.join(path, fname), mode='rb') as f:
        html = f.read()
    # Name the parser explicitly: without it bs4 warns and picks whichever
    # parser happens to be installed, which can differ between machines.
    soup = BeautifulSoup(html, 'html.parser')
    for heading in soup.find_all('h4', class_='aviso-data-title'):
        # href=True only matches anchors that actually have an href, so a
        # heading without a usable link is skipped rather than raising.
        link = heading.find('a', href=True)
        if link is not None:
            hrefs.append(link['href'])
    # Running total of links collected so far.
    print(len(hrefs))
df = pd.DataFrame(hrefs)
df.to_csv('hrefs.csv', index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment