Skip to content

Instantly share code, notes, and snippets.

@AdrianoPereira
Created September 9, 2020 14:42
Show Gist options
  • Save AdrianoPereira/134fb66dc32278ac1f4988b603cda38e to your computer and use it in GitHub Desktop.
Save AdrianoPereira/134fb66dc32278ac1f4988b603cda38e to your computer and use it in GitHub Desktop.
import os
from bs4 import BeautifulSoup
import json
def get_filepaths(path):
    """Return the paths of every entry directly inside ``path``.

    Each name reported by ``os.listdir(path)`` is joined onto ``path``. No
    filtering is applied, so sub-directories and non-HTML files are included.

    Args:
        path: Directory to list.

    Returns:
        A list of joined paths in sorted order (empty for an empty directory).

    Raises:
        OSError: Propagated from ``os.listdir`` (e.g. ``FileNotFoundError``
            when ``path`` does not exist).
    """
    # os.listdir yields entries in arbitrary, filesystem-dependent order;
    # sorting makes the result reproducible across runs and machines.
    return sorted(os.path.join(path, talk) for talk in os.listdir(path))
def extract_questions_talk(filepath):
    """Extract the questions from one saved talk page.

    The HTML file is parsed with BeautifulSoup and every
    ``<div class="card question-item">`` element becomes one record.

    Args:
        filepath: Path to the saved HTML page.

    Returns:
        A list of dicts, one per question card, with the keys:
        ``'name'`` (text of the ``question-item__author truncate`` div),
        ``'question'`` (text of the ``Linkify`` span) and
        ``'upvotes'`` (the ``aria-label`` attribute of the ``score__btn``
        button, returned as-is). The list is empty when no card is found.

    Raises:
        OSError: If the file cannot be opened.
        AttributeError: If a card lacks one of the expected elements
            (``find`` returns ``None`` in that case).
        KeyError: If the score button has no ``aria-label`` attribute.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default,
    # which mis-decodes or rejects non-ASCII text on non-UTF-8 locales.
    with open(filepath, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file.read(), "html.parser")

    cards = soup.find_all('div', {'class': 'card question-item'})
    return [
        {
            'name': card.find(
                'div', {'class': 'question-item__author truncate'}
            ).text,
            'question': card.find('span', {'class': 'Linkify'}).text,
            'upvotes': card.find(
                'button', {'class': 'score__btn'}
            ).attrs['aria-label'],
        }
        for card in cards
    ]
def get_all_questions(folder, **kw):
    """Extract the questions of every talk page found in ``folder``.

    Each path returned by ``get_filepaths(folder)`` is parsed with
    ``extract_questions_talk``; the results are keyed by the file's base
    name with a trailing ``.html`` removed.

    Args:
        folder: Directory containing the saved talk pages.
        **kw: Optional settings:
            save (bool): When truthy, also write the result as JSON.
                Defaults to ``False``.
            filename (str): Destination of the JSON file. Required when
                ``save`` is truthy; if missing, a message is printed and
                nothing is written.

    Returns:
        A dict mapping talk name to the list of question records produced
        by ``extract_questions_talk``.
    """
    suffix = '.html'
    talks_data = dict()
    for path in get_filepaths(folder):
        # basename handles the platform's path separator, unlike split('/').
        talk_name = os.path.basename(path)
        # Strip the extension only when it is the actual suffix, so a name
        # that merely contains ".html" elsewhere is left intact.
        if talk_name.endswith(suffix):
            talk_name = talk_name[:-len(suffix)]
        # Reuse the single-page parser instead of duplicating its logic.
        talks_data[talk_name] = extract_questions_talk(path)

    if kw.get('save', False):
        filename = kw.get('filename', None)
        if filename:
            with open(filename, 'w', encoding='utf-8') as fp:
                json.dump(talks_data, fp)
        else:
            print('File name is not defined!')
    return talks_data
if __name__ == "__main__":
    # Demo run: parse one saved talk page and show its first question.
    PATH = './pages/Perguntas - Dra. Karine Reis.html'
    data = extract_questions_talk(PATH)
    # A page without question cards yields an empty list; indexing it
    # blindly would raise IndexError.
    if data:
        print(data[0])
    else:
        print('No questions found in', PATH)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment