Buzzfeed article scraper for NLP
from bs4 import BeautifulSoup
import requests
# for cleaning:
import re
import string
import unicodedata
import nltk
from itertools import chain


def scrape_buzzfeed_article(url):
    """
    inputs: a valid buzzfeed article url
    returns: a list of strings, which are the paragraphs
        containing the text of the article
    """
    this_page = requests.get(url)
    soup = BeautifulSoup(this_page.content, 'lxml')
    article = []
    # Buzzfeed wraps each chunk of article text in an element with the
    # 'subbuzz-text' class; collect the <p> tags inside each one.
    for p in soup.find_all(class_='subbuzz-text'):
        for q in p.find_all("p"):
            article.append(q.text)
    return article


def buzzclean(line):
    """Normalize unicode and strip scraping artifacts from a line of text."""
    dirtychars = "]["
    dubquotechars = "“”"
    singquotechars = "‘’"
    line = unicodedata.normalize("NFKD", line)
    for ch in line:
        # drop stray brackets and straighten curly quotes
        if ch in dirtychars:
            line = line.replace(ch, "")
        if ch in dubquotechars:
            line = line.replace(ch, '"')
        if ch in singquotechars:
            line = line.replace(ch, "'")
    # some articles embed a stray JSON fragment; blank it out
    if '{\n "id": 0\n }' in line:
        line = line.replace('{\n "id": 0\n }', ' ')
    return line


# to download and clean an article:
if __name__ == "__main__":
    url = 'https://www.buzzfeed.com/dominicholden/a-gay-couple-just-asked-the-supreme-court-to-rule-against-a?utm_term=.bceyRpPgQ#.gi8NJyM1K'
    article = scrape_buzzfeed_article(url)
    article = [buzzclean(line) for line in article]
    # collapse runs of whitespace, then split each cleaned paragraph into
    # lowercased sentences with nltk's punkt tokenizer
    sentences = [re.sub(r"\s+", " ", line) for line in article]
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = [punkt.tokenize(sentence.lower()) for sentence in sentences]
    sentences = list(chain.from_iterable(sentences))
    article = ' '.join(sentences)
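    # Quick, illustrative sanity check (an assumed extra step, not part of the
    # scraper itself): a word-frequency count over the cleaned text gives a
    # rough sense of what was pulled down. Assumes nltk's punkt tokenizer data
    # has already been downloaded, e.g. via nltk.download('punkt').
    from collections import Counter
    words = nltk.word_tokenize(article)
    top_words = Counter(w for w in words if w not in string.punctuation).most_common(10)
    print(top_words)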