Skip to content

Instantly share code, notes, and snippets.

@ihavenonickname
Created August 10, 2018 17:21
Show Gist options
  • Save ihavenonickname/3adcdc2be4b9833deb148674770f6e4b to your computer and use it in GitHub Desktop.
Save ihavenonickname/3adcdc2be4b9833deb148674770f6e4b to your computer and use it in GitHub Desktop.
import grequests
import json
import unicodedata
from bs4 import BeautifulSoup
def download(urls):
    """Fetch every URL in *urls* concurrently and yield the results.

    One ``grequests.get`` request is built lazily per URL and the batch is
    driven by ``grequests.imap`` with ``size=8``. When a request raises, the
    exception is printed by the handler passed to ``imap``.
    """
    def report_failure(request, error):
        # Exception handler for imap: log the error and return nothing.
        print(error)

    pending = (grequests.get(url) for url in urls)
    yield from grequests.imap(
        pending,
        size=8,
        exception_handler=report_failure,
    )
def normalize(text):
    """Return *text* folded to lower-case ASCII with outer whitespace removed.

    The text is decomposed with NFKD so accented letters split into a base
    letter plus combining marks; every non-ASCII code point is then dropped
    and the remainder is lower-cased.

    Args:
        text: The string to normalize.

    Returns:
        A ``str`` containing only ASCII characters, lower-cased, with no
        leading or trailing whitespace.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_text = decomposed.encode('ASCII', 'ignore').decode('ASCII')
    # Strip after the ASCII fold, not before: dropping a leading or trailing
    # non-ASCII character (curly quote, bullet, zero-width space, ...) can
    # expose whitespace that an earlier strip would have left in place.
    return ascii_text.strip().lower()
def extract_posts_urls(html):
    """Yield the ``href`` of each post link in a listing page.

    The page is parsed with ``html.parser`` and links are taken from
    ``<a>`` elements matched by ``div[id="mostra_listagem"] > ul > li > a``.
    Anchors without an ``href`` attribute are skipped.
    """
    listing = BeautifulSoup(html, 'html.parser')
    anchors = listing.select('div[id="mostra_listagem"] > ul > li > a')
    yield from (link['href'] for link in anchors if link.has_attr('href'))
def extract_post_content(html):
    """Return the normalized text of a post page's content container.

    The page is parsed with ``html.parser``; the text of the first ``div``
    whose ``class`` attribute is exactly ``noticia-cnt`` is passed through
    ``normalize``.

    Args:
        html: Markup of a single post page.

    Returns:
        The normalized text, or an empty string when the page has no
        matching ``div``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    container = soup.select_one('div[class="noticia-cnt"]')
    if container is None:
        # No content container on this page: report "no content" instead of
        # raising IndexError from indexing an empty result list.
        return ''
    return normalize(container.text)
def main():
    """Crawl the news listing pages and save the long posts to ``musica.json``.

    Listing pages 1 through 1000 are downloaded, every post link found on a
    listing page is downloaded in turn, and the extracted text of each post
    is kept when it is at least 500 characters long. A progress line is
    printed after each listing page has been processed. When the crawl ends
    the collected texts are written as a JSON array and the function waits
    for ENTER before returning.
    """
    total_pages = 1000
    min_post_length = 500
    posts = []
    pages_done = 0

    listing_urls = (
        f'https://www.cifraclubnews.com.br/noticias/page/{page}/'
        for page in range(1, total_pages + 1)
    )
    for listing in download(listing_urls):
        pages_done += 1
        for post in download(extract_posts_urls(listing.text)):
            try:
                content = extract_post_content(post.text)
            except IndexError:
                # The post page has no content container. Skip it so a single
                # odd page does not discard everything collected so far.
                continue
            if len(content) >= min_post_length:
                posts.append(content)
        # pages_done counts listing pages yielded by download().
        print(f'{100 * pages_done / total_pages:.1f}% done, '
              f'{len(posts)} posts extracted')

    with open('musica.json', 'w') as f:
        json.dump(posts, f, indent=4)

    input('100% done! Press [ENTER] to close')
# Start the crawl only when this file is run as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment