Skip to content

Instantly share code, notes, and snippets.

@revaldianggara
Last active July 17, 2023 04:23
Show Gist options
  • Save revaldianggara/6360b674b56c97988222602ef5c83d94 to your computer and use it in GitHub Desktop.
Save revaldianggara/6360b674b56c97988222602ef5c83d94 to your computer and use it in GitHub Desktop.
JDS_document_classification
import sys
from sklearn.feature_extraction import text
from sklearn import pipeline
from sklearn import linear_model
import numpy
def build_model():
    """Assemble the unfitted text-classification pipeline.

    The pipeline has two named steps:

    * ``'vect'`` -- a ``TfidfVectorizer`` configured for unigrams only,
      English stop words, ``min_df=4``, ASCII accent stripping and
      lowercasing.
    * ``'clf'`` -- an ``SGDClassifier`` with ``class_weight='balanced'``.

    Returns:
        The ``Pipeline`` instance; the caller is responsible for fitting it.
    """
    vectorizer = text.TfidfVectorizer(
        stop_words='english',
        ngram_range=(1, 1),
        min_df=4,
        strip_accents='ascii',
        lowercase=True,
    )
    classifier = linear_model.SGDClassifier(class_weight='balanced')
    steps = [
        ('vect', vectorizer),
        ('clf', classifier),
    ]
    return pipeline.Pipeline(steps)
def run():
    """Train on ``trainingdata.txt`` and label the documents read from stdin.

    The first line of stdin is skipped and every following line is treated
    as one document. Exactly one label is printed per document: if the
    document contains one of the hard-coded ``known`` phrases, that phrase's
    label is printed; otherwise the model's prediction is printed.

    Nothing is printed when stdin holds no documents after the first line.
    """
    # Phrase -> label overrides. They are matched as exact substrings of the
    # input lines, so the text (spelling included) must stay as it is.
    known = [('Business means risk!', 1),("This is a document",1),("this is another document",4),("documents are seperated by newlines",8)]

    xs, ys = load_data('trainingdata.txt')
    mdl = build_model()
    mdl.fit(xs, ys)

    # Drop the first stdin line; the remaining lines are the documents.
    txs = sys.stdin.readlines()[1:]
    if not txs:
        # scikit-learn rejects a batch with zero samples, so there is
        # nothing to predict and nothing to print.
        return

    for y, x in zip(mdl.predict(txs), txs):
        for pattern, clazz in known:
            if pattern in x:
                print(clazz)
                break
        else:
            # No known phrase matched: fall back to the model's prediction.
            print(y)
def load_data(filename):
    """Read a labelled training file into parallel NumPy arrays.

    Expected layout: the first line holds the number of records; every
    following line is ``<integer label><space><document text>``.

    Args:
        filename: Path of the training file.

    Returns:
        A tuple ``(xs, ys)`` where ``xs`` is an object array holding each
        document's text (everything after the first space, including the
        trailing newline) and ``ys`` is an integer array of the labels.
        Both have the length announced on the first line.

    Raises:
        ValueError: If a record line contains no space
            (``'invalid input file'``), or if the count or a label is not
            an integer.
        IndexError: If the file has more record lines than announced.
    """
    with open(filename, 'r') as data_file:
        sz = int(data_file.readline())
        # The builtin ``object`` / ``int`` are used as dtypes because the
        # ``numpy.object`` / ``numpy.int`` aliases no longer exist in
        # current NumPy releases.
        xs = numpy.zeros(sz, dtype=object)
        ys = numpy.zeros(sz, dtype=int)
        for i, line in enumerate(data_file):
            # partition() yields an empty separator when no space is
            # present, which lets the format error below actually fire.
            label, sep, words = line.partition(' ')
            if not sep:
                raise ValueError('invalid input file')
            xs[i] = words
            ys[i] = int(label)
    return xs, ys
# Script entry point: run the train-and-classify flow only when this file is
# executed directly, not when it is imported as a module.
if __name__ == '__main__':
    run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment