#!/bin/bash
# exit on any error
set -o errexit
# exit if an uninitialised variable is used
set -o nounset
# refresh package lists, then install nginx (-y avoids the interactive prompts)
sudo apt-get update
sudo apt-get install -y nginx
sudo apt-get upgrade -y
####################################
# shell commands to configure Graphite
# Last tested & updated 27/10/2013
####################################
sudo apt-get update
sudo apt-get upgrade
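# download the Graphite 0.9.9 components (web app, carbon daemon, whisper storage library)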
wget http://launchpad.net/graphite/0.9/0.9.9/+download/graphite-web-0.9.9.tar.gz
wget http://launchpad.net/graphite/0.9/0.9.9/+download/carbon-0.9.9.tar.gz
wget http://launchpad.net/graphite/0.9/0.9.9/+download/whisper-0.9.9.tar.gz
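##########################
# create a vagrant VM (precise32 base box)
##########################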
vagrant init precise32 http://files.vagrantup.com/precise32.box
##########################
# ssh to vagrant machine
##########################
vagrant ssh
##########################
# configure the directory to use vagrant
##########################
import re
import unicodedata

import nltk
import requests
import spacy
from bs4 import BeautifulSoup
from nltk.tokenize.toktok import ToktokTokenizer

# setup (assumptions: the spaCy model and NLTK stopword corpus are installed,
# e.g. via `python -m spacy download en_core_web_sm` and nltk.download('stopwords');
# CONTRACTION_MAP, a dict mapping contractions to their expansions, is defined elsewhere)
nlp = spacy.load('en_core_web_sm')
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')

seed_urls = ['https://inshorts.com/en/read/technology',
             'https://inshorts.com/en/read/sports',
             'https://inshorts.com/en/read/world']

def build_dataset(seed_urls):
    news_data = []
    for url in seed_urls:
        news_category = url.split('/')[-1]
        data = requests.get(url)
        soup = BeautifulSoup(data.content, 'html.parser')
        # site-specific parsing: these selectors assume inshorts.com's news-card markup
        for card in soup.find_all('div', class_='news-card'):
            news_data.append({'news_headline': card.find(itemprop='headline').get_text(strip=True),
                              'news_article': card.find(itemprop='articleBody').get_text(strip=True),
                              'news_category': news_category})
    return news_data
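
# example usage (a sketch: needs network access to inshorts.com; pandas is an
# assumption here, used only to tabulate the scraped records):
#   import pandas as pd
#   news_df = pd.DataFrame(build_dataset(seed_urls))
#   print(news_df.head())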

def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text()
    return stripped_text

def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                      flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match) \
            if contraction_mapping.get(match) \
            else contraction_mapping.get(match.lower())
        # keep the casing of the first character of the original match
        expanded_contraction = first_char + expanded_contraction[1:]
        return expanded_contraction

    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text
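
# example (hypothetical two-entry mapping standing in for the full CONTRACTION_MAP):
#   expand_contractions("I can't believe you'd go",
#                       contraction_mapping={"can't": "cannot", "you'd": "you would"})
#   -> 'I cannot believe you would go'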

def remove_special_characters(text, remove_digits=False):
    # note: the character class uses A-Z, not A-z (which would also match [\]^_` and backtick)
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', text)
    return text

def simple_stemmer(text):
    ps = nltk.stem.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

def lemmatize_text(text):
    text = nlp(text)
    # spaCy 2.x lemmatizes pronouns to the placeholder '-PRON-'; keep the original token instead
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text
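
# quick comparison on one sentence (exact outputs depend on the NLTK and spaCy versions):
#   simple_stemmer('My system keeps crashing, his crashed yesterday')
#   lemmatize_text('My system keeps crashing, his crashed yesterday')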

def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
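
# a sketch of chaining the helpers above into a single normalisation pass;
# normalize_corpus is not part of the original snippet, and the order of steps
# is an assumption (e.g. expanding contractions before stripping special characters)
def normalize_corpus(corpus):
    normalized = []
    for doc in corpus:
        doc = strip_html_tags(doc)
        doc = remove_accented_chars(doc)
        doc = expand_contractions(doc)
        doc = doc.lower()
        doc = lemmatize_text(doc)
        doc = remove_special_characters(doc)
        doc = remove_stopwords(doc, is_lower_case=True)
        normalized.append(doc)
    return normalized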