#!/bin/bash
# exit on any error
set -o errexit
# exit if an uninitialised variable is used
set -o nounset
# refresh package lists, then install nginx (-y avoids the interactive prompts)
sudo apt-get update
sudo apt-get install -y nginx
sudo apt-get upgrade -y
####################################
# shell commands to configure Graphite
# Last tested & updated 27/10/2013
####################################
sudo apt-get update
sudo apt-get upgrade
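# download the Graphite 0.9.9 components (web app, carbon daemon, whisper storage library)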
wget http://launchpad.net/graphite/0.9/0.9.9/+download/graphite-web-0.9.9.tar.gz
wget http://launchpad.net/graphite/0.9/0.9.9/+download/carbon-0.9.9.tar.gz
wget http://launchpad.net/graphite/0.9/0.9.9/+download/whisper-0.9.9.tar.gz
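##########################
# create a vagrant VM (precise32 base box)
##########################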
vagrant init precise32 http://files.vagrantup.com/precise32.box
##########################
# ssh to vagrant machine
##########################
vagrant ssh
##########################
# configure the directory to use vagrant
##########################
import re
import unicodedata

import nltk
import requests
import spacy
from bs4 import BeautifulSoup
from nltk.tokenize.toktok import ToktokTokenizer

# setup (assumptions: the spaCy model and NLTK stopword corpus are installed,
# e.g. via `python -m spacy download en_core_web_sm` and nltk.download('stopwords');
# CONTRACTION_MAP, a dict mapping contractions to their expansions, is defined elsewhere)
nlp = spacy.load('en_core_web_sm')
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')

seed_urls = ['https://inshorts.com/en/read/technology',
             'https://inshorts.com/en/read/sports',
             'https://inshorts.com/en/read/world']

def build_dataset(seed_urls):
    news_data = []
    for url in seed_urls:
        news_category = url.split('/')[-1]
        data = requests.get(url)
        soup = BeautifulSoup(data.content, 'html.parser')
        # site-specific parsing: these selectors assume inshorts.com's news-card markup
        for card in soup.find_all('div', class_='news-card'):
            news_data.append({'news_headline': card.find(itemprop='headline').get_text(strip=True),
                              'news_article': card.find(itemprop='articleBody').get_text(strip=True),
                              'news_category': news_category})
    return news_data
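
# example usage (a sketch: needs network access to inshorts.com; pandas is an
# assumption here, used only to tabulate the scraped records):
#   import pandas as pd
#   news_df = pd.DataFrame(build_dataset(seed_urls))
#   print(news_df.head())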

def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text()
    return stripped_text

def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                      flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match) \
            if contraction_mapping.get(match) \
            else contraction_mapping.get(match.lower())
        # keep the casing of the first character of the original match
        expanded_contraction = first_char + expanded_contraction[1:]
        return expanded_contraction

    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text
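
# example (hypothetical two-entry mapping standing in for the full CONTRACTION_MAP):
#   expand_contractions("I can't believe you'd go",
#                       contraction_mapping={"can't": "cannot", "you'd": "you would"})
#   -> 'I cannot believe you would go'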

def remove_special_characters(text, remove_digits=False):
    # note: the character class uses A-Z, not A-z (which would also match [\]^_` and backtick)
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', text)
    return text

def simple_stemmer(text):
    ps = nltk.stem.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

def lemmatize_text(text):
    text = nlp(text)
    # spaCy 2.x lemmatizes pronouns to the placeholder '-PRON-'; keep the original token instead
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text
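
# quick comparison on one sentence (exact outputs depend on the NLTK and spaCy versions):
#   simple_stemmer('My system keeps crashing, his crashed yesterday')
#   lemmatize_text('My system keeps crashing, his crashed yesterday')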

def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
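
# a sketch of chaining the helpers above into a single normalisation pass;
# normalize_corpus is not part of the original snippet, and the order of steps
# is an assumption (e.g. expanding contractions before stripping special characters)
def normalize_corpus(corpus):
    normalized = []
    for doc in corpus:
        doc = strip_html_tags(doc)
        doc = remove_accented_chars(doc)
        doc = expand_contractions(doc)
        doc = doc.lower()
        doc = lemmatize_text(doc)
        doc = remove_special_characters(doc)
        doc = remove_stopwords(doc, is_lower_case=True)
        normalized.append(doc)
    return normalized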