Created
March 24, 2011 13:46
Cross-lingual product category dataset creation script.
#!/usr/bin/python
"""Creates the product category dataset from the Cross-Lingual
Sentiment dataset [1]. The output can be used directly with the
CLSCL reference implementation in NUT [2].

Usage:

  ./mk_product_cat_dataset.py {en|de|fr|jp} {train|test|unlabeled} output_dir num_docs

e.g. use the following line to create the French unlabeled document set:

  ./mk_product_cat_dataset.py fr unlabeled fr/product_category 20000

The product category dataset was used in:

  P. Prettenhofer and B. Stein, Cross-lingual adaptation using structural
  correspondence learning, ACM TIST (to appear), 2011.

[1] http://www.uni-weimar.de/cms/medien/webis/research/corpora/webis-cls-10.html
[2] http://www.github.com/pprett/nut
"""
import sys
from os import path, mkdir
from itertools import islice

cats = ["books", "dvd", "music"]


def pipenlabel(src, target, label, n=50000):
    """Pipe max `n` lines from `src` to `target` and label each line
    with `label`.
    """
    for line in islice(src, n):
        # Keep everything up to and including the last colon (i.e. drop
        # the original label) and append the new label instead.
        target.write(line[:line.rindex(":") + 1] + label)
        target.write("\n")


def main(argv):
    lang, type, out_dir, n = argv
    n = int(n)
    out_dir = path.normpath(out_dir)
    if not path.exists(out_dir):
        mkdir(out_dir)
    fout = open(path.join(out_dir, "%s.processed" % type), "w+")
    for cat in cats:
        pipenlabel(open(path.join(lang, cat, "%s.processed" % type)),
                   fout, cat, n=n)
    fout.close()


if __name__ == "__main__":
    main(sys.argv[1:])
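The per-line relabeling that `pipenlabel` performs can be illustrated in isolation. A minimal sketch, assuming a processed line of `feature:count` pairs ending in a final `label_field:<label>` token (the sample line below is made up for illustration, not taken from the corpus):

```python
def relabel(line, label):
    """Replace everything after the last colon (the original label)
    with a new category label, as pipenlabel does for each line."""
    return line[:line.rindex(":") + 1] + label

# Hypothetical example line in the processed feature:count format:
line = "great:2 boring:1 #label#:positive\n"
print(relabel(line, "books"))
# -> great:2 boring:1 #label#:books
```

Because the slice stops at the last colon, the trailing newline is dropped along with the old label, which is why `pipenlabel` writes a fresh `"\n"` after each relabeled line.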