Skip to content

Instantly share code, notes, and snippets.

/** Base class for animals, identified by a name.
  *
  * @param name the animal's name; exposed as a public `val` so subclasses
  *             may override it
  */
class Animal(val name: String) {
  /** Sound made by the animal; subclasses are expected to override this. */
  def speak: String = "animal speaking"

  /** Description of what the animal eats; subclasses may override this. */
  def eat: String = "animal eats"

  /** Renders the animal as `"animal: "` followed by its name. */
  override def toString: String = "animal: " + name
}
/** A cat: an [[Animal]] with its own `speak` and `eat`, plus one
  * cat-specific member. `toString` is not overridden here, so it is
  * inherited from [[Animal]].
  *
  * @param name the cat's name; overrides `Animal.name`
  */
class Cat(override val name: String) extends Animal(name) {
  /** Cats meow. */
  override def speak = "meow"

  /** Cats eat fish. */
  override def eat = "fish"

  /** Member available only on `Cat`, not on the `Animal` base class. */
  def catonly: String = "cat only"
}
def process_all(text):
    """Run the full text-cleaning pipeline over ``text`` and return the result.

    The steps are applied strictly in the order below; each step receives the
    output of the previous one. The helper functions are defined elsewhere,
    so their exact behaviour is not visible here -- the step comments only
    restate what their names suggest (TODO: confirm against the helpers).

    Args:
        text: the raw input string.

    Returns:
        The value produced by the last step, ``stem_sentence``.
    """
    text = remove_tags(text)                      # presumably strips markup tags
    text = process_single_quote(text)             # presumably normalises apostrophes
    text = decontract(text)                       # presumably expands contractions
    text = remove_special_chars(text)
    text = remove_more_than_single_space(text)
    text = text.lower()                           # lower-case before stemming
    text = stem_sentence(text)
    return text
-- basic functions
-- (Type signatures only; no definitions are shown in this snippet.)
-- | Takes a value of any type and yields a value of that same type.
id :: a -> a
-- | Takes two arguments and yields a value of the first argument's type;
-- the second argument's type @b@ does not appear in the result.
const :: a -> b -> a
-- | Takes a two-argument function and yields a function accepting the
-- same two arguments in the opposite order.
flip :: (a -> b -> c) -> b -> a -> c
-- function composition
-- compose
-- | Combines a function @b -> c@ with a function @a -> b@ into a single
-- function @a -> c@.
(.) :: (b -> c) -> (a -> b) -> a -> c
import nltk
# Fetch the WordNet corpus (reconstructed from the garbled line
# "import nltk'wordnet')"); WordNetLemmatizer needs it at lookup time.
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

# Single lemmatizer instance created once at module level.
wnl = WordNetLemmatizer()
def lemmatize_text(text):
tokens = word_tokenize(text)

"farewel blast for a hotel In pittsburgh A control implos top begin to crumbl the 16-stori carlton hous hotel in downtown pittsburgh saturday morn the 28-vear-old build is be raze to make way for renaiss II the second phase of a major redevelop of the citi thick smoke billow from the build center as more than 1,000 explos charg do their work seven second after it began the demolit is complet bottom construct of a 52-stori offic build ha been propos in place of the hotel where former soviet premier khrushchev stay while visit the citi dure hi tour of the unit state in 1959.roanok upi industri analyst believ recent labor unrest and other problem in poland south africa and australia may lead foreign coal buyer to depend more heavili on the unit state than in recent years.poland is labor chang racial unrest in south africa and a major miner ' strike in australia the world is three largest coal export behind the unit state are spread the he- ' lief that the unit state may be t he mo

import nltk
# Fetch the Punkt tokenizer models (reconstructed from the garbled line
# "import nltk'punkt')"); word_tokenize relies on them.
nltk.download('punkt')
from nltk.stem import PorterStemmer
from nltk import word_tokenize

# Single Porter stemmer instance created once at module level.
stemmer = PorterStemmer()
def stem_token(token):
stemmed_token = stemmer.stem(token)

['u.s.a', '.']

from nltk import word_tokenize

['Farewell', 'Blast', 'For', 'a', 'Hotel', 'In',

import nltk
from nltk import word_tokenize
# Tokenise `text`, then keep only the tokens that are neither numbers
# (according to isNumber) nor a bare '.' or ',' token.
tokens = [
    tok
    for tok in word_tokenize(text)
    if not (isNumber(tok) or tok == '.' or tok == ',')
]
import re
import string
punctuation = string.punctuation
def remove_punctuation(text):
    """Replace every character that is not an ASCII letter with a space.

    Despite the name, this is broader than punctuation removal: digits,
    whitespace and non-ASCII characters are replaced as well. Each replaced
    character becomes exactly one space, so the result has the same length
    as the input and runs of spaces are not collapsed.

    Args:
        text: the input string.

    Returns:
        A string of the same length containing only ``a-z``, ``A-Z`` and
        spaces.
    """
    return re.sub(r'[^a-zA-Z]', ' ', text)