Skip to content

Instantly share code, notes, and snippets.

@gangeli
Last active August 29, 2015 13:59
Show Gist options
  • Save gangeli/10570324 to your computer and use it in GitHub Desktop.
Save gangeli/10570324 to your computer and use it in GitHub Desktop.
Fast "sed" for find/replace over a large pattern set
/**
* Fast "sed" for performing find/replace on a huge number of patterns, not
* requiring any actual regular expression matching.
* Note that this is in no way "fully featured" or really anything like sed,
* beyond the fact that I use sed almost exclusively for find/replace and the
* patterns file is nominally compatible with real sed.
*
* To compile: g++ -std=c++0x -O3 fsed.cc -o fsed
*
* Usage: ./fsed <path_to_patterns_file>
*
* Patterns File Format (one pattern per line):
*
* s/input pattern/replacement/g
* s/another pattern/another replacement/g
*
*
* Released under the MIT license:
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <unordered_map>
#include <boost/algorithm/string.hpp>
#include <boost/tokenizer.hpp>
#define MAX_TOKEN_LENGTH 1024
using namespace std;
unordered_map<string, string> PATTERNS;
struct entry {
uint64_t index;
uint64_t gloss_begin;
uint64_t gloss_length;
};
uint32_t split(const std::string s, string* buffer) {
boost::char_separator<char> sep(" ", "", boost::keep_empty_tokens);
boost::tokenizer<boost::char_separator<char> > tok(s, sep);
int i = 0;
for(boost::tokenizer<boost::char_separator<char> >::iterator beg= tok.begin(); beg!=tok.end(); ++beg) {
buffer[i] = *beg;
i += 1;
if (i >= MAX_TOKEN_LENGTH) { break; }
}
return i;
}
int main(int argc, char** argv) {
// Read Input
if (argc < 2) {
cout << "Usage: sed <expressions_file>" << endl;
return 1;
}
char index_str[32];
string line;
vector<struct entry> entries;
ifstream infile(argv[1]);
line.reserve(1024);
uint64_t buffer_index = 0;
while (std::getline(infile, line)) {
const char* cline = line.c_str();
std::size_t slash_index = line.substr(0, line.length() - 3).find_last_of("/");
std::size_t length = line.length();
string gloss = line.substr(2, slash_index - 2);
string index = line.substr(slash_index + 1, length - 2 - (slash_index + 1));
PATTERNS[gloss] = index;
}
// Run sed
string tokens[MAX_TOKEN_LENGTH];
string buffer;
buffer.reserve(4096);
while (getline(cin, line)) {
// Variables we'll need
const uint32_t num_tokens = split(line, tokens);
bool found[num_tokens];
memset(found, false, num_tokens);
char indices[num_tokens][64];
// Index substrings of the original string
for (int length = num_tokens; length > 0; --length) {
for (int begin = 0; begin <= num_tokens - length; ++begin) {
// Make sure we haven't consumed this span yet
bool do_search = true;
for (int k = begin; k < begin + length; ++k) {
if (found[k]) { do_search = false; break; }
}
if (!do_search) { continue; }
// Create key
buffer.clear();
buffer.append(tokens[begin]);
for (int k = begin + 1; k < begin + length; ++k) {
buffer.push_back(' ');
buffer.append(tokens[k]);
}
// Check key
unordered_map<string,string>::iterator it = PATTERNS.find(buffer);
if (it != PATTERNS.end()) {
string index = it->second;
for (int k = begin; k < begin + length; ++k) {
memcpy(indices[k], index.c_str(), index.length());
indices[k][index.length()] = '\0';
found[k] = true;
}
}
}
}
// Collapse indices
string last_index = "~~~~~~NULL~~~~~~";
for (int k = 0; k < num_tokens; ++k) {
if (!found[k]) {
cout << tokens[k];
last_index = "~~~~~~NULL~~~~~~";
} else if (last_index != string(indices[k])) {
cout << indices[k];
last_index = indices[k];
}
if (k < num_tokens - 1) {
cout << ' ';
}
}
cout << endl;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment