Skip to content

Instantly share code, notes, and snippets.

View aaronkub's full-sized avatar

Aaron Kub aaronkub

View GitHub Profile
@aaronkub
aaronkub / README.md
Created June 3, 2021 16:57 — forked from davideicardi/README.md
Write and read Avro records from bytes array

Avro serialization

There are 4 possible serialization format when using avro:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
stop_words = ['in', 'of', 'at', 'a', 'the']
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3), stop_words=stop_words)
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(reviews_train_clean)
X = tfidf_vectorizer.transform(reviews_train_clean)
X_test = tfidf_vectorizer.transform(reviews_test_clean)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
wc_vectorizer = CountVectorizer(binary=False)
wc_vectorizer.fit(reviews_train_clean)
X = wc_vectorizer.transform(reviews_train_clean)
X_test = wc_vectorizer.transform(reviews_test_clean)
X_train, X_val, y_train, y_val = train_test_split(
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)
from nltk.corpus import stopwords
english_stop_words = stopwords.words('english')
def remove_stop_words(corpus):
removed_stop_words = []
for review in corpus:
removed_stop_words.append(
' '.join([word for word in review.split()
if word not in english_stop_words])
)
def get_lemmatized_text(corpus):
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
return [' '.join([lemmatizer.lemmatize(word) for word in review.split()]) for review in corpus]
lemmatized_reviews = get_lemmatized_text(reviews_train_clean)
def get_stemmed_text(corpus):
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
return [' '.join([stemmer.stem(word) for word in review.split()]) for review in corpus]
stemmed_reviews = get_stemmed_text(reviews_train_clean)
feature_to_coef = {
word: coef for word, coef in zip(
cv.get_feature_names(), final_model.coef_[0]
)
}
for best_positive in sorted(
feature_to_coef.items(),
key=lambda x: x[1],
reverse=True)[:5]:
print (best_positive)