Skip to content

Instantly share code, notes, and snippets.

@dnabanita7
Created June 4, 2019 13:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dnabanita7/b3c83c58f154c2ce0cc73950e71f8baa to your computer and use it in GitHub Desktop.
Save dnabanita7/b3c83c58f154c2ce0cc73950e71f8baa to your computer and use it in GitHub Desktop.
pd.concat can append DataFrames, but this script raises an error when accumulating the chunks
Display the source blob
Display the rendered blob
Raw
import os
import sys
import errno
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
try:
import cPickle as pickle
except ImportError:
import pickle
# Ask numpy to print small floats in plain decimal form instead of
# scientific notation.
np.set_printoptions(suppress=True)

# Expected invocation: <script> data-dir-path features-dir-path.
# An argument count of 5 is also let through, although only the first two
# positional arguments are read below.
if len(sys.argv) not in (3, 5):
    sys.stderr.write('Arguments error. Usage:\n')
    sys.stderr.write('\tpython featurization.py data-dir-path features-dir-path\n')
    sys.exit(1)

data_dir, features_dir = sys.argv[1], sys.argv[2]

# Inputs are read from the data directory, outputs go to the features one.
train_input = os.path.join(data_dir, 'train.tsv')
test_input = os.path.join(data_dir, 'test.tsv')
train_output = os.path.join(features_dir, 'train.pkl')
test_output = os.path.join(features_dir, 'test.pkl')

# Python 2 only: switch the interpreter's default string encoding to UTF-8.
# On Python 3 `reload` is not a builtin, so the NameError is swallowed and
# this whole step is a no-op.
try:
    reload(sys)
    sys.setdefaultencoding('utf-8')
except NameError:
    pass
def mkdir_p(path):
    """Create directory *path* and any missing parents, like ``mkdir -p``.

    An already-existing directory is not an error.

    Args:
        path: directory to create.

    Raises:
        OSError: for any failure other than "*path* already exists and is a
            directory" -- including *path* existing as a regular file.
    """
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        # The only benign failure is EEXIST on something that really is a
        # directory; everything else is propagated to the caller.
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise
def save_matrix(df, matrix, output):
    """Prefix *matrix* with id and label columns and pickle the result.

    The written object is a CSR matrix whose column 0 is ``df.id``, column 1
    is ``df.label`` and whose remaining columns are those of *matrix*. A
    one-line summary (path, shape, dtype) is written to stderr.

    Args:
        df: frame with ``id`` and ``label`` columns castable to int64; it
            must have as many rows as *matrix* for the stacking to succeed.
        matrix: sparse feature matrix to store.
        output: path of the pickle file to write (opened in binary mode).
    """
    def as_sparse_column(series):
        # Build the (n_rows, 1) column from a plain ndarray instead of
        # handing a pandas Series to scipy and transposing the result.
        values = series.astype(np.int64).values
        return sparse.csr_matrix(values.reshape(-1, 1))

    result = sparse.hstack(
        [as_sparse_column(df.id), as_sparse_column(df.label), matrix],
        format='csr',
    )

    msg = 'The output matrix {} size is {} and data type is {}\n'
    sys.stderr.write(msg.format(output, result.shape, result.dtype))

    with open(output, 'wb') as fd:
        pickle.dump(result, fd, pickle.HIGHEST_PROTOCOL)
# Make sure the features directory exists before anything is written to it.
mkdir_p(sys.argv[2])

# Generate train feature matrix
#
# The TSV is still parsed in 1000-row chunks, but the chunks are merged with
# pd.concat *before* any fitting. The vocabulary and IDF weights must come
# from the whole training corpus: fitting a fresh vectorizer per chunk would
# give every chunk a different, incompatible column layout and leave only the
# last chunk's fit available for the test set.
train_chunks = list(pd.read_csv(
    train_input,
    encoding='utf-8',
    header=None,
    sep='\t',
    names=['id', 'label', 'text'],
    iterator=True,
    chunksize=1000
))
# pd.concat replaces the DataFrame.append accumulation (removed in pandas 2.0)
# and needs no pre-initialised accumulator.
df_train = pd.concat(train_chunks, ignore_index=True)

# Lower-case the text and coerce it to a unicode array (missing values become
# the string 'nan' through the astype('U') cast).
train_words = np.array(df_train.text.str.lower().values.astype('U'))

# Bag-of-words counts limited to the 5000 most frequent non-stop-word terms.
bag_of_words = CountVectorizer(stop_words='english',
                               max_features=5000)
train_words_binary_matrix = bag_of_words.fit_transform(train_words)

# Re-weight the counts with TF-IDF; `bag_of_words` and `tfidf` stay at module
# level so the test-set section can reuse the fitted transformers.
tfidf = TfidfTransformer(smooth_idf=False)
train_words_tfidf_matrix = tfidf.fit_transform(train_words_binary_matrix)

save_matrix(df_train, train_words_tfidf_matrix, train_output)
# Generate test feature matrix
#
# The transformers were already fitted on the training data, so each chunk
# can be transformed independently. Per-chunk results are collected in plain
# lists and combined once at the end: DataFrames with pd.concat, sparse
# matrices with sparse.vstack (scipy sparse matrices have no `.append`).
test_frames = []
test_tfidf_parts = []
for tp in pd.read_csv(test_input,
                      encoding='utf-8',
                      header=None,
                      sep='\t',
                      names=['id', 'label', 'text'],
                      iterator=True,
                      chunksize=50):
    # Same normalisation as for the training text: lower-case unicode array.
    test_words = np.array(tp.text.str.lower().values.astype('U'))
    test_words_binary_matrix = bag_of_words.transform(test_words)
    test_tfidf_parts.append(tfidf.transform(test_words_binary_matrix))
    test_frames.append(tp)

# Row order of the stacked matrix matches the row order of df_test because
# both lists were filled in the same chunk order.
df_test = pd.concat(test_frames, ignore_index=True)
test_words_tfidf_matrix = sparse.vstack(test_tfidf_parts, format='csr')

save_matrix(df_test, test_words_tfidf_matrix, test_output)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment