Buzzfeed article scraper for NLP
from bs4 import BeautifulSoup
import requests
# for cleaning:
import re
import string
import unicodedata
import nltk
from itertools import chain


def scrape_buzzfeed_article(url):
    """
    inputs: a valid buzzfeed article url
    returns: a list of strings, which are the paragraphs
        containing the text of the article
    """
    this_page = requests.get(url)
    soup = BeautifulSoup(this_page.content, 'lxml')
    article = []
    # Buzzfeed wraps each chunk of article text in an element with the
    # 'subbuzz-text' class; collect the <p> tags inside each one.
    for p in soup.find_all(class_='subbuzz-text'):
        for q in p.find_all("p"):
            article.append(q.text)
    return article


def buzzclean(line):
    """Normalize unicode and strip scraping artifacts from a line of text."""
    dirtychars = "]["
    dubquotechars = "“”"
    singquotechars = "‘’"
    line = unicodedata.normalize("NFKD", line)
    for ch in line:
        # drop stray brackets and straighten curly quotes
        if ch in dirtychars:
            line = line.replace(ch, "")
        if ch in dubquotechars:
            line = line.replace(ch, '"')
        if ch in singquotechars:
            line = line.replace(ch, "'")
    # some articles embed a stray JSON fragment; blank it out
    if '{\n "id": 0\n }' in line:
        line = line.replace('{\n "id": 0\n }', ' ')
    return line


# to download and clean an article:
if __name__ == "__main__":
    url = 'https://www.buzzfeed.com/dominicholden/a-gay-couple-just-asked-the-supreme-court-to-rule-against-a?utm_term=.bceyRpPgQ#.gi8NJyM1K'
    article = scrape_buzzfeed_article(url)
    article = [buzzclean(line) for line in article]
    # collapse runs of whitespace, then split each cleaned paragraph into
    # lowercased sentences with nltk's punkt tokenizer
    sentences = [re.sub(r"\s+", " ", line) for line in article]
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = [punkt.tokenize(sentence.lower()) for sentence in sentences]
    sentences = list(chain.from_iterable(sentences))
    article = ' '.join(sentences)
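    # Quick, illustrative sanity check (an assumed extra step, not part of the
    # scraper itself): a word-frequency count over the cleaned text gives a
    # rough sense of what was pulled down. Assumes nltk's punkt tokenizer data
    # has already been downloaded, e.g. via nltk.download('punkt').
    from collections import Counter
    words = nltk.word_tokenize(article)
    top_words = Counter(w for w in words if w not in string.punctuation).most_common(10)
    print(top_words)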