Skip to content

Instantly share code, notes, and snippets.

@andreasvc
Last active August 29, 2015 14:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save andreasvc/ab3845503eec3da55585 to your computer and use it in GitHub Desktop.
Save andreasvc/ab3845503eec3da55585 to your computer and use it in GitHub Desktop.
#include <sdsl/suffix_arrays.hpp>
#include <string>
#include <iostream>
#include <algorithm>
#include <iomanip>
#include <sdsl/vectors.hpp>
#include <fstream>
#include <sstream>
#include <unordered_map>
using namespace sdsl;
using namespace std;
// Build a compressed FM-index over a word-tokenized text file and answer
// pattern queries read from stdin: each input line is a space-separated
// token sequence; each output line is a bracketed list of the sentence
// numbers in which that sequence occurs.
//
// argv[1]: text file, one sentence per line, space-separated tokens.
// argv[2]: optional cap on reported locations (0 disables the cap;
//          default 1000).
// Returns 1 on usage error or missing input file, 0 otherwise.
int main(int argc, char** argv) {
    if (argc < 2) {
        cout << "Usage " << argv[0] << " text_file [max_locations]" << endl;
        cout << " This program constructs a very compact FM-index" << endl;
        cout << " which supports count, locate, and extract queries." << endl;
        cout << " text_file Original text file." << endl;
        cout << " max_locations Maximal number of locations to report; 0 to disable [default 1000]." << endl;
        cout << "input: send queries to stdin; one query per line of space separated tokens" << endl;
        cout << "output: lines of the form '[index1, index2, ...]'" << endl;
        cout << "the output corresponds to the lines of input." << endl;
        return 1;
    }
    size_t max_locations = 1000;  // cap on reported matches per query; 0 = unlimited
    size_t sentcnt = 0;           // current sentence (input line) number
    // Next token id to assign. Ids 0..2 are reserved: 0 never appears in the
    // text (sdsl uses 0 as terminator), 1 marks unknown query words, and 2
    // marks a sentence start; cnt is pre-incremented, so real words get >= 3.
    int cnt = 2;
    if (argc >= 3)
        max_locations = atoi(argv[2]);
    unordered_map<string, int> mapping;  // word -> integer token id
    vector<int> data;                    // whole corpus as one token sequence
    vector<int> wordno2sentno;           // token position -> sentence number
    string line;
    // Read the corpus: map each distinct word to an integer, prepend a
    // sentence-start token (2) to every line, and record, for every token
    // position, which sentence it belongs to.
    ifstream is(argv[1]);
    while (getline(is, line)) {
        istringstream linestream(line);
        string word;
        data.push_back(2);  // start-of-sentence marker
        wordno2sentno.push_back(sentcnt);
        while (linestream >> word) {
            if (mapping.find(word) == mapping.end()) {
                cnt++;
                mapping[word] = cnt;
            }
            data.push_back(mapping[word]);
            // Simple but space-inefficient sentence-number table.
            wordno2sentno.push_back(sentcnt);
        }
        sentcnt++;
    }
    string index_suffix = ".fm9";
    string index_file = string(argv[1]) + index_suffix;
    csa_wt<wt_int<rrr_vector<127> > > fm_index;
    // NOTE(review): if a stale .fm9 exists for a text file that has since
    // changed, the loaded index and the freshly built mapping/wordno2sentno
    // tables will disagree silently — consider checksumming the input.
    if (!load_from_file(fm_index, index_file)) {
        ifstream in(argv[1]);
        if (!in) {
            cerr << "ERROR: File " << argv[1] << " does not exist. Exit." << endl;
            return 1;
        }
        cerr << "No index " << index_file << " located. Building index now." << endl;
        // Copy to a compressed vector (maybe store in there directly?).
        // Zeros would corrupt the index (0 is the sdsl terminator), so they
        // are reported and skipped defensively; by construction data never
        // contains 0.
        int_vector<> data2(data.size());
        for (size_t i = 0; i < data.size(); i++)
            if (data[i] == 0)
                cerr << "Zero at " << i << endl;
            else
                data2[i] = data[i];
        // Create the FM-index from the int_vector in memory.
        construct_im(fm_index, data2);
        // Alternatively, write the int vector to a file:
        // string int_suffix = ".int";
        // string int_file = string(argv[1]) + int_suffix;
        // store_to_file(data2, int_file);
        // and build the index from that file:
        // construct(fm_index, int_file, 0);
        store_to_file(fm_index, index_file);
        cerr << "Index construction complete, index requires ";
        cerr << size_in_mega_bytes(fm_index) << " MiB." << endl;
    }
    cerr << "Input search terms and press Ctrl-D to exit." << endl;
    while (getline(cin, line)) {
        // Translate the query into token ids using the corpus mapping;
        // words never seen in the corpus become token 1 (cannot match).
        vector<int> query;
        istringstream linestream(line);
        string word;
        while (linestream >> word) {
            auto search = mapping.find(word);
            if (search == mapping.end())
                query.push_back(1);  // unknown word, not part of mapping
            else
                query.push_back(search->second);
        }
        // Count, then locate up to max_locations occurrences, and print
        // the sentence number of each occurrence.
        size_t m = query.size();
        size_t occs = sdsl::count(fm_index, query.begin(), query.end());
        cout << "[";
        if (occs > 0) {
            auto locations = locate(fm_index, query.begin(), query.begin() + m);
            // Optional: sort sentence numbers.
            // sort(locations.begin(), locations.end());
            if (max_locations > 0)
                occs = min(occs, max_locations);
            if (occs > 0)
                cout << wordno2sentno[locations[0]];
            for (size_t i = 1; i < occs; ++i)
                cout << ", " << wordno2sentno[locations[i]];
        }
        cout << "]" << endl;
    }
}
#!/bin/sh
# Pipeline: extract the unique source/target substrings from each gzipped
# kernel table, then run fm-index over every corpus side in parallel,
# feeding it the extracted queries and saving the matching sentence
# numbers; finally compress all results.
# splitsubstr.py writes source.txt and target.txt into the current
# directory, hence the cd into each language-pair directory first.
cd enit
zcat enit-kernels.table.m2.gz | python ../splitsubstr.py
cd ../ennl
zcat ennl-kernels.table.m2.gz | python ../splitsubstr.py
cd ..
# Second argument 0 disables the max_locations cap (report all matches).
# pv only displays progress; the data passes through unchanged.
./fm-index enit/train.tags.en-it.clean.tok.lc.en 0 < enit/source.txt | pv -l -t -r -b > enit/source.txt.idx &
./fm-index enit/train.tags.en-it.clean.tok.lc.it 0 < enit/target.txt | pv -l -t -r -b > enit/target.txt.idx &
./fm-index ennl/train.tags.nl-en.clean.tok.lc.en 0 < ennl/source.txt | pv -l -t -r -b > ennl/source.txt.idx &
./fm-index ennl/train.tags.nl-en.clean.tok.lc.nl 0 < ennl/target.txt | pv -l -t -r -b > ennl/target.txt.idx &
# Wait for all four background fm-index jobs to finish before compressing.
wait
gzip */*.txt */*.txt.idx
# requires sdsl:
# git clone https://github.com/simongog/sdsl-lite.git
# cd sdsl-lite
# ./install.sh $HOME/.local
# uses pv to display progress (not essential)
# http://www.ivarch.com/programs/pv.shtml
# Default target: build the fm-index binary and the corpus indices.
# NOTE(review): the `indices` rule is not visible in this fragment — confirm
# it is defined elsewhere in the Makefile.
all: fm-index indices
# Compile fm-index.cpp against sdsl-lite installed under $HOME/.local
# (see the install notes in the shell script); requires C++11.
# NOTE(review): recipe lines must be tab-indented in the actual Makefile —
# the indentation appears lost in this paste.
fm-index:
g++ fm-index.cpp -o fm-index -std=c++11 -DNDEBUG -O3 -msse4.2 \
-lsdsl -ldivsufsort -ldivsufsort64 \
-I${HOME}/.local/include -L${HOME}/.local/lib
"""Read substring table from stdin and produce 2 files source.txt, and
target.txt, with a list of unique substrings."""
import sys
target = set()
with open('source.txt', 'w') as src:
for line in sys.stdin:
if line.startswith('\t\t['):
target.add(line[3:line.index(']\t', 2)])
elif line.startswith('\t['):
src.write(' '.join(
line[2:line.index(']', 2)].split(', ')) + '\n')
with open('target.txt', 'w') as tar:
tar.writelines(' '.join(a.split(', ')) + '\n' for a in target)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment