Skip to content

Instantly share code, notes, and snippets.

@airawat
Last active December 30, 2015 16:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save airawat/7853279 to your computer and use it in GitHub Desktop.
Save airawat/7853279 to your computer and use it in GitHub Desktop.
Using a RegEx filter with the Accumulo Proxy C# client
......
// Collects the row IDs of all entries whose row matches "<rowID>.*" by attaching
// Accumulo's RegExFilter iterator to a proxy scanner.
// NOTE(review): client, loginToken, tableName, rowID, GetBytes and GetString come
// from the elided part of this snippet.
List<String> artifactList = new List<String>();

// NOTE(review): rowID is concatenated into the pattern unescaped, so any regex
// metacharacters it contains are interpreted as such - confirm that is intended.
String rowRegex = rowID + ".*";

Dictionary<string, string> iterProperties = new Dictionary<string, string>();
// RegExFilter reads the row pattern from the option key "rowRegex"; "ROW_REGEX"
// is only the name of the Java constant that holds this key, not the key itself.
iterProperties.Add("rowRegex", rowRegex);

IteratorSetting iterSttng = new IteratorSetting();
iterSttng.Priority = 15;
iterSttng.Name = "rowIDRegexFilter";
iterSttng.IteratorClass = "org.apache.accumulo.core.iterators.user.RegExFilter";
iterSttng.Properties = iterProperties;

List<IteratorSetting> listIterSttngs = new List<IteratorSetting>();
listIterSttngs.Add(iterSttng);

THashSet<byte[]> Auths = new THashSet<byte[]>();
Auths.Add(GetBytes("Public"));

var scanOpts = new ScanOptions();
scanOpts.Authorizations = Auths;
// Without this assignment the iterator list is never sent to the server and the
// scan returns every row unfiltered.
scanOpts.Iterators = listIterSttngs;

String scannerInstance = null;
try
{
    scannerInstance = client.createScanner(loginToken, tableName, scanOpts);
    var more = true;
    while (more)
    {
        // Fetch results in batches of 10 until the proxy reports no more data.
        var scan = client.nextK(scannerInstance, 10);
        more = scan.More;
        foreach (var entry in scan.Results)
        {
            artifactList.Add(GetString(entry.Key.Row));
        }
    }
}
catch (Exception e)
{
    // Print the full exception (type, message and stack trace), not just the trace.
    Console.WriteLine(e);
}
finally
{
    // Always release the server-side scanner, even when the scan itself failed.
    if (scannerInstance != null)
    {
        try
        {
            client.closeScanner(scannerInstance);
        }
        catch (Exception closeError)
        {
            Console.WriteLine(closeError);
        }
    }
}
http://affy.blogspot.com/2013/03/exampe-using-accumulos-regexfilter-class.html
Courtesy of Dave Medinets
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.SortedMap;
import java.util.TreeMap;
import org.apache.accumulo.core.client.IteratorSetting;
import org.apache.accumulo.core.conf.AccumuloConfiguration;
import org.apache.accumulo.core.data.ByteSequence;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.iterators.IteratorEnvironment;
import org.apache.accumulo.core.iterators.IteratorUtil;
import org.apache.accumulo.core.iterators.SortedKeyValueIterator;
import org.apache.accumulo.core.iterators.SortedMapIterator;
import org.apache.accumulo.core.iterators.system.MapFileIterator;
import org.apache.accumulo.core.iterators.user.RegExFilter;
import org.apache.accumulo.core.util.CachedConfiguration;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.Logger;
/**
 * Stand-alone playground that runs Accumulo's {@link RegExFilter} over a small
 * in-memory map, without needing a running Accumulo instance.
 *
 * <p>Two entries are inserted, with rows {@code "1111"} and {@code "/1111"}, and the
 * filter is configured with the row expression {@code "/.*"}. Every entry the filter
 * lets through is written to the log.
 */
public class AccumuloRegExIteratorPlayground {

    /** One logger per class is enough; it does not need to be an instance field. */
    private static final Logger log = Logger.getLogger(AccumuloRegExIteratorPlayground.class);

    /** Immutable "no column family restriction" argument passed to {@code seek}. */
    private static final Collection<ByteSequence> EMPTY_COL_FAMS =
            Collections.<ByteSequence>emptyList();

    /** Explicit charset so the test values do not depend on the platform default. */
    private static final Charset UTF_8 = Charset.forName("UTF-8");

    /**
     * Builds the sample data, configures the filter with a row regular expression,
     * and logs each key/value pair that the filter exposes.
     *
     * @throws IOException if initialising, seeking or advancing the iterator fails
     * @throws RuntimeException if the filter rejects the generated options
     */
    public void process() throws IOException {
        final String regularExpression = "/.*";

        final SortedMap<Key, Value> input = new TreeMap<Key, Value>();
        input.put(new Key("1111", "2222", "3333", 0), new Value("4444".getBytes(UTF_8)));
        input.put(new Key("/1111", "2222", "3333", 0), new Value("4444".getBytes(UTF_8)));

        final RegExFilter rei = new RegExFilter();

        // Only the row term is set; the family, qualifier and value terms are left null.
        final IteratorSetting is = new IteratorSetting(1, RegExFilter.class);
        RegExFilter.setRegexs(is, regularExpression, null, null, null, false);

        if (!rei.validateOptions(is.getOptions())) {
            throw new RuntimeException("invalid options.");
        }

        rei.init(new SortedMapIterator(input), is.getOptions(), newPlaygroundEnvironment());
        rei.seek(new Range(), EMPTY_COL_FAMS, false);

        while (rei.hasTop()) {
            final Key key = rei.getTopKey();
            final Value value = rei.getTopValue();
            log.info(key + " --> " + value);
            rei.next();
        }
    }

    /**
     * Creates the minimal {@link IteratorEnvironment} handed to the filter's
     * {@code init}. Only the configuration and map-file lookups are implemented;
     * the remaining methods throw {@link UnsupportedOperationException}.
     */
    private static IteratorEnvironment newPlaygroundEnvironment() {
        return new IteratorEnvironment() {
            @Override
            public SortedKeyValueIterator<Key, Value> reserveMapFileReader(String mapFileName)
                    throws IOException {
                Configuration conf = CachedConfiguration.getInstance();
                FileSystem fs = FileSystem.get(conf);
                return new MapFileIterator(
                        AccumuloConfiguration.getDefaultConfiguration(), fs, mapFileName, conf);
            }

            @Override
            public AccumuloConfiguration getConfig() {
                return AccumuloConfiguration.getDefaultConfiguration();
            }

            @Override
            public IteratorUtil.IteratorScope getIteratorScope() {
                throw new UnsupportedOperationException("Not supported yet.");
            }

            @Override
            public boolean isFullMajorCompaction() {
                throw new UnsupportedOperationException("Not supported yet.");
            }

            @Override
            public void registerSideChannel(SortedKeyValueIterator<Key, Value> iter) {
                throw new UnsupportedOperationException("Not supported yet.");
            }
        };
    }

    /**
     * Command-line entry point; runs {@link #process()} once.
     *
     * @param args ignored
     * @throws IOException propagated from {@link #process()}
     */
    public static void main(final String[] args) throws IOException {
        AccumuloRegExIteratorPlayground driver = new AccumuloRegExIteratorPlayground();
        driver.process();
    }
}
#!/usr/bin/env python
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pyaccumulo import Accumulo, Mutation, Range
from pyaccumulo.iterators import *
from pyaccumulo.proxy.ttypes import IteratorSetting, IteratorScope
from examples.util import hashcode
import hashlib, re
import settings
# Example: load every line of the LICENSE file into a fresh "regexes" table
# (row = line number, column family "e", value = stripped line text), then
# batch-scan the table with three RegExFilter iterators attached.
conn = Accumulo(host=settings.HOST, port=settings.PORT, user=settings.USER, password=settings.PASSWORD)
try:
    table = "regexes"

    # Start from an empty table on every run.
    if conn.table_exists(table):
        conn.delete_table(table)
    conn.create_table(table)

    license_file = "LICENSE"
    wr = conn.create_batch_writer(table)
    try:
        # open() instead of the Python-2-only file() builtin.
        with open(license_file) as infile:
            for linenum, line in enumerate(infile, 1):
                m = Mutation(str(linenum))
                m.put(cf="e", cq="", val=line.strip())
                wr.add_mutation(m)
    finally:
        # Close the writer even if reading or writing failed part-way through.
        wr.close()

    # Three value filters at increasing priorities.
    # NOTE(review): presumably an entry must pass all three to be returned - confirm.
    regex1 = RegExFilter(priority=21, val_regex=".*stated.*", match_substring=True, name="RegExFilter1")
    regex2 = RegExFilter(priority=22, val_regex='.*patent', match_substring=True, name="RegExFilter2")
    regex3 = RegExFilter(priority=23, val_regex='have made', match_substring=True, name="RegExFilter3")

    for e in conn.batch_scan(table, cols=[["e"]], iterators=[regex1, regex2, regex3]):
        print(e)
finally:
    # Always release the proxy connection.
    conn.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment