#!/usr/bin/python
"""Cross-lingual product category dataset creation script.

Creates the product category dataset from the Cross-Lingual
Sentiment dataset [1]. The output can be used directly with the
CLSCL reference implementation in NUT [2].

Usage:

    ./mk_product_cat_dataset.py {en|de|fr|jp} {train|test|unlabeled} output_dir num_docs

`num_docs` is the maximum number of documents taken from each product
category.

e.g. use the following line to create the French unlabeled document set:

    ./mk_product_cat_dataset.py fr unlabeled fr/product_category 20000

The product category dataset was used in:

    P. Prettenhofer and B. Stein, Cross-lingual adaptation using structural
    correspondence learning, ACM TIST (to appear), 2011.

[1] http://www.uni-weimar.de/cms/medien/webis/research/corpora/webis-cls-10.html
[2] http://www.github.com/pprett/nut
"""
import sys
from itertools import islice
from os import makedirs, mkdir, path
# Product categories. Each name is used both as the sub-directory that is
# read below the language directory and as the label written to the output.
cats = ["books", "dvd", "music"]
def pipenlabel(src, target, label, n=50000):
    """Pipe max `n` lines from `src` to `target` and label each line
    with `label`.

    Each line is cut just after its last ":" and `label` is written in
    place of whatever followed (including the original line ending);
    a single newline is then written.

    Parameters
    ----------
    src : iterable of str
        Source lines, e.g. an open text file.
    target : object with a ``write`` method
        Destination the relabelled lines are written to.
    label : str
        Text placed after the last ":" of every line.
    n : int, optional
        Maximum number of lines consumed from `src`.

    Raises
    ------
    ValueError
        If a line contains no ":" (raised by ``str.rindex``).
    """
    for line in islice(src, n):
        # rindex raises on a line without ":" instead of silently
        # emitting a malformed record.
        target.write(line[:line.rindex(":") + 1] + label)
        target.write("\n")
def main(argv):
    """Build one relabelled ``<split>.processed`` file from all categories.

    For every category in `cats`, up to `num_docs` lines are read from
    ``<lang>/<category>/<split>.processed`` and appended, relabelled with
    the category name, to ``<output_dir>/<split>.processed``.

    Parameters
    ----------
    argv : sequence of str
        Exactly four items: ``lang``, ``split`` (the file stem, e.g.
        ``train``), ``output_dir`` and ``num_docs``.

    Raises
    ------
    ValueError
        If `argv` does not hold exactly four items, or ``num_docs`` is
        not an integer.
    """
    # `split` rather than `type`, to avoid shadowing the builtin.
    lang, split, out_dir, n = argv
    n = int(n)
    out_dir = path.normpath(out_dir)
    if not path.exists(out_dir):
        # makedirs also creates missing parent directories, so a nested
        # output path works even when its parent does not exist yet.
        makedirs(out_dir)
    file_name = "%s.processed" % split
    # Context managers close the output and every input file, also when
    # an exception interrupts the copy.
    with open(path.join(out_dir, file_name), "w+") as fout:
        for cat in cats:
            with open(path.join(lang, cat, file_name)) as fin:
                pipenlabel(fin, fout, cat, n=n)
# Run only when executed as a script; the program name is dropped so that
# main() receives just the four positional arguments.
if __name__ == "__main__":
    main(sys.argv[1:])
# Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment.