Skip to content

Instantly share code, notes, and snippets.

@greenfigo2015
Forked from fannix/text_to_libsvm.py
Created January 16, 2017 17:36
Show Gist options
  • Save greenfigo2015/b50ea0fee4f4135f8f27c17db0edc14b to your computer and use it in GitHub Desktop.
Save greenfigo2015/b50ea0fee4f4135f8f27c17db0edc14b to your computer and use it in GitHub Desktop.
Convert text format to libsvm format
""" project text to libsvm vector space
"""
from gensim import corpora, models, similarities
import sys
import os.path
def load_document(stream=None):
    """Read documents from a text stream, one document per line.

    Each line is stripped of surrounding whitespace; blank lines are skipped.
    The remaining lines are split on whitespace and duplicate tokens within a
    line are collapsed, so every document carries binary (presence/absence)
    features rather than term counts.

    Args:
        stream: Iterable yielding text lines. Defaults to ``sys.stdin``,
            looked up at call time.

    Returns:
        A list with one entry per non-blank line; each entry is the sorted
        list of that line's unique tokens.
    """
    if stream is None:
        stream = sys.stdin

    document_li = []
    for line in stream:
        line = line.strip()
        if not line:
            continue
        # Binary features: keep each token once (use line.split() directly
        # for count features instead). Sorting makes the result independent
        # of set iteration order, which can differ between runs.
        document_li.append(sorted(set(line.split())))
    return document_li
def main():
    """Convert text read from stdin into an SVMlight/libsvm-format file.

    Command line: ``<script> outfile vocabfile < infile``

    ``outfile`` receives the serialized bag-of-words corpus. ``vocabfile`` is
    a path prefix for the vocabulary: if ``vocabfile + '.bin'`` already
    exists it is loaded and reused; otherwise a new dictionary is built from
    the input documents, pruned with ``filter_extremes(no_below=3,
    no_above=0.4)``, and saved both as ``vocabfile + '.bin'`` and as
    ``vocabfile + '.txt'``.

    Exits with status -1 after printing a usage message when the number of
    command-line arguments is wrong.
    """
    if len(sys.argv) != 3:
        # Show the real script name rather than a literal "%prog" placeholder.
        prog = os.path.basename(sys.argv[0]) if sys.argv else "text_to_libsvm.py"
        usage = "Usage: %s outfile vocabfile < infile" % prog
        # Function-call form is valid on both Python 2 and Python 3.
        print(usage)
        sys.exit(-1)

    outfile = sys.argv[1]
    vocabfile = sys.argv[2]
    document_li = load_document()

    vocab_bin = vocabfile + '.bin'
    if os.path.exists(vocab_bin):
        # Reuse the saved vocabulary instead of rebuilding it from this input.
        vocab = corpora.Dictionary.load(vocab_bin)
    else:
        vocab = corpora.Dictionary(document_li)
        vocab.filter_extremes(no_below=3, no_above=0.4)
        vocab.save(vocab_bin)
        # NOTE(review): assumed to belong to this branch (text dump written
        # only when the vocabulary is freshly built) -- confirm.
        vocab.save_as_text(vocabfile + '.txt')

    corpus = [vocab.doc2bow(doc) for doc in document_li]
    corpora.SvmLightCorpus.serialize(outfile, corpus)
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment