Skip to content

Instantly share code, notes, and snippets.

@andreasvc
Last active August 29, 2015 14:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save andreasvc/ab3845503eec3da55585 to your computer and use it in GitHub Desktop.
Save andreasvc/ab3845503eec3da55585 to your computer and use it in GitHub Desktop.
#include <sdsl/suffix_arrays.hpp>
#include <string>
#include <iostream>
#include <algorithm>
#include <iomanip>
#include <sdsl/vectors.hpp>
#include <fstream>
#include <sstream>
#include <unordered_map>
using namespace sdsl;
using namespace std;
// Build a compressed FM-index over a word-tokenized text file and answer
// pattern queries read from stdin: each input line is a space-separated
// token sequence; each output line is a bracketed list of the sentence
// numbers in which that sequence occurs.
//
// argv[1]: text file, one sentence per line, space-separated tokens.
// argv[2]: optional cap on reported locations (0 disables the cap;
//          default 1000).
// Returns 1 on usage error or missing input file, 0 otherwise.
int main(int argc, char** argv) {
    if (argc < 2) {
        cout << "Usage " << argv[0] << " text_file [max_locations]" << endl;
        cout << " This program constructs a very compact FM-index" << endl;
        cout << " which supports count, locate, and extract queries." << endl;
        cout << " text_file Original text file." << endl;
        cout << " max_locations Maximal number of locations to report; 0 to disable [default 1000]." << endl;
        cout << "input: send queries to stdin; one query per line of space separated tokens" << endl;
        cout << "output: lines of the form '[index1, index2, ...]'" << endl;
        cout << "the output corresponds to the lines of input." << endl;
        return 1;
    }
    size_t max_locations = 1000;  // cap on reported matches per query; 0 = unlimited
    size_t sentcnt = 0;           // current sentence (input line) number
    // Next token id to assign. Ids 0..2 are reserved: 0 never appears in the
    // text (sdsl uses 0 as terminator), 1 marks unknown query words, and 2
    // marks a sentence start; cnt is pre-incremented, so real words get >= 3.
    int cnt = 2;
    if (argc >= 3)
        max_locations = atoi(argv[2]);
    unordered_map<string, int> mapping;  // word -> integer token id
    vector<int> data;                    // whole corpus as one token sequence
    vector<int> wordno2sentno;           // token position -> sentence number
    string line;
    // Read the corpus: map each distinct word to an integer, prepend a
    // sentence-start token (2) to every line, and record, for every token
    // position, which sentence it belongs to.
    ifstream is(argv[1]);
    while (getline(is, line)) {
        istringstream linestream(line);
        string word;
        data.push_back(2);  // start-of-sentence marker
        wordno2sentno.push_back(sentcnt);
        while (linestream >> word) {
            if (mapping.find(word) == mapping.end()) {
                cnt++;
                mapping[word] = cnt;
            }
            data.push_back(mapping[word]);
            // Simple but space-inefficient sentence-number table.
            wordno2sentno.push_back(sentcnt);
        }
        sentcnt++;
    }
    string index_suffix = ".fm9";
    string index_file = string(argv[1]) + index_suffix;
    csa_wt<wt_int<rrr_vector<127> > > fm_index;
    // NOTE(review): if a stale .fm9 exists for a text file that has since
    // changed, the loaded index and the freshly built mapping/wordno2sentno
    // tables will disagree silently — consider checksumming the input.
    if (!load_from_file(fm_index, index_file)) {
        ifstream in(argv[1]);
        if (!in) {
            cerr << "ERROR: File " << argv[1] << " does not exist. Exit." << endl;
            return 1;
        }
        cerr << "No index " << index_file << " located. Building index now." << endl;
        // Copy to a compressed vector (maybe store in there directly?).
        // Zeros would corrupt the index (0 is the sdsl terminator), so they
        // are reported and skipped defensively; by construction data never
        // contains 0.
        int_vector<> data2(data.size());
        for (size_t i = 0; i < data.size(); i++)
            if (data[i] == 0)
                cerr << "Zero at " << i << endl;
            else
                data2[i] = data[i];
        // Create the FM-index from the int_vector in memory.
        construct_im(fm_index, data2);
        // Alternatively, write the int vector to a file:
        // string int_suffix = ".int";
        // string int_file = string(argv[1]) + int_suffix;
        // store_to_file(data2, int_file);
        // and build the index from that file:
        // construct(fm_index, int_file, 0);
        store_to_file(fm_index, index_file);
        cerr << "Index construction complete, index requires ";
        cerr << size_in_mega_bytes(fm_index) << " MiB." << endl;
    }
    cerr << "Input search terms and press Ctrl-D to exit." << endl;
    while (getline(cin, line)) {
        // Translate the query into token ids using the corpus mapping;
        // words never seen in the corpus become token 1 (cannot match).
        vector<int> query;
        istringstream linestream(line);
        string word;
        while (linestream >> word) {
            auto search = mapping.find(word);
            if (search == mapping.end())
                query.push_back(1);  // unknown word, not part of mapping
            else
                query.push_back(search->second);
        }
        // Count, then locate up to max_locations occurrences, and print
        // the sentence number of each occurrence.
        size_t m = query.size();
        size_t occs = sdsl::count(fm_index, query.begin(), query.end());
        cout << "[";
        if (occs > 0) {
            auto locations = locate(fm_index, query.begin(), query.begin() + m);
            // Optional: sort sentence numbers.
            // sort(locations.begin(), locations.end());
            if (max_locations > 0)
                occs = min(occs, max_locations);
            if (occs > 0)
                cout << wordno2sentno[locations[0]];
            for (size_t i = 1; i < occs; ++i)
                cout << ", " << wordno2sentno[locations[i]];
        }
        cout << "]" << endl;
    }
}
#!/bin/sh
# Pipeline: extract the unique source/target substrings from each gzipped
# kernel table, then run fm-index over every corpus side in parallel,
# feeding it the extracted queries and saving the matching sentence
# numbers; finally compress all results.
# splitsubstr.py writes source.txt and target.txt into the current
# directory, hence the cd into each language-pair directory first.
cd enit
zcat enit-kernels.table.m2.gz | python ../splitsubstr.py
cd ../ennl
zcat ennl-kernels.table.m2.gz | python ../splitsubstr.py
cd ..
# Second argument 0 disables the max_locations cap (report all matches).
# pv only displays progress; the data passes through unchanged.
./fm-index enit/train.tags.en-it.clean.tok.lc.en 0 < enit/source.txt | pv -l -t -r -b > enit/source.txt.idx &
./fm-index enit/train.tags.en-it.clean.tok.lc.it 0 < enit/target.txt | pv -l -t -r -b > enit/target.txt.idx &
./fm-index ennl/train.tags.nl-en.clean.tok.lc.en 0 < ennl/source.txt | pv -l -t -r -b > ennl/source.txt.idx &
./fm-index ennl/train.tags.nl-en.clean.tok.lc.nl 0 < ennl/target.txt | pv -l -t -r -b > ennl/target.txt.idx &
# Wait for all four background fm-index jobs to finish before compressing.
wait
gzip */*.txt */*.txt.idx
# requires sdsl:
# git clone https://github.com/simongog/sdsl-lite.git
# cd sdsl-lite
# ./install.sh $HOME/.local
# uses pv to display progress (not essential)
# http://www.ivarch.com/programs/pv.shtml
# Default target: build the fm-index binary and the corpus indices.
# NOTE(review): the `indices` rule is not visible in this fragment — confirm
# it is defined elsewhere in the Makefile.
all: fm-index indices
# Compile fm-index.cpp against sdsl-lite installed under $HOME/.local
# (see the install notes in the shell script); requires C++11.
# NOTE(review): recipe lines must be tab-indented in the actual Makefile —
# the indentation appears lost in this paste.
fm-index:
g++ fm-index.cpp -o fm-index -std=c++11 -DNDEBUG -O3 -msse4.2 \
-lsdsl -ldivsufsort -ldivsufsort64 \
-I${HOME}/.local/include -L${HOME}/.local/lib
"""Read substring table from stdin and produce 2 files source.txt, and
target.txt, with a list of unique substrings."""
import sys
target = set()
with open('source.txt', 'w') as src:
for line in sys.stdin:
if line.startswith('\t\t['):
target.add(line[3:line.index(']\t', 2)])
elif line.startswith('\t['):
src.write(' '.join(
line[2:line.index(']', 2)].split(', ')) + '\n')
with open('target.txt', 'w') as tar:
tar.writelines(' '.join(a.split(', ')) + '\n' for a in target)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment