Skip to content

Instantly share code, notes, and snippets.

@shenwei356
Last active June 17, 2016 02:08
Show Gist options
  • Save shenwei356/fffa45551b44e16f5cb386f6d445d837 to your computer and use it in GitHub Desktop.
Save shenwei356/fffa45551b44e16f5cb386f6d445d837 to your computer and use it in GitHub Desktop.
querying by item list
#!/usr/bin/env python
"""Generate benchmark data: a list of unique query items and a subject file
whose lines each contain several '; '-separated items."""
import argparse
import random

parser = argparse.ArgumentParser(description="generate test data",
                                 epilog="https://github.com/shenwei356/")
parser.add_argument('-n',
                    type=int,
                    default=700000,
                    help='number of query list')
parser.add_argument('-m',
                    type=int,
                    default=3000000,
                    help='number of subject list')
parser.add_argument('-m2',
                    type=int,
                    default=4,
                    help='number of items of one line in subject list')
parser.add_argument('-s1', type=int, default=5, help='minimum size of item')
# BUG FIX: help text said "minimum size of item"; s2 is the *maximum*
# length bound (see generate_item: length = s1 + randint(0, s2 - s1)).
parser.add_argument('-s2', type=int, default=10, help='maximum size of item')
parser.add_argument('-fq', '--query-file', type=str, default="query.txt", help='query file')
parser.add_argument('-fs', '--subject-file', type=str, default="subject.txt", help='subject file')
args = parser.parse_args()

# Character pool for generated items.
alphabet = 'abcdefghijklmnopqrstuvwxyz0123456789_-'
alphabet_size = len(alphabet)

# BUG FIX: the original did `random.seed = 1`, which *rebinds* the seed
# function to the integer 1 instead of seeding the RNG (and breaks any
# later call to random.seed). Call it properly for reproducible output.
random.seed(1)
def generate_item(s1, s2, alphabet='abcdefghijklmnopqrstuvwxyz0123456789_-'):
    """Return a random string whose length is uniform in [s1, s2].

    Parameters:
        s1: minimum item length.
        s2: maximum item length (must be >= s1).
        alphabet: character pool to draw from; defaults to the same
            pool the script uses, so existing two-argument callers are
            unchanged (generalized from a module-level constant).

    Uses the module-level `random` state: one randint for the length,
    then one randint per character, matching the original draw order.
    """
    n = len(alphabet)  # hoisted out of the per-character loop
    size = s1 + random.randint(0, s2 - s1)
    return "".join(alphabet[random.randint(0, n - 1)] for _ in range(size))
# Build the query set: args.n unique random items (the set de-duplicates,
# so loop until the target size is reached).
query_data = set()
while len(query_data) < args.n:
    query_data.add(generate_item(args.s1, args.s2))

# Write one query item per line.
with open(args.query_file, 'w') as fh:
    fh.writelines('{}\n'.format(item) for item in query_data)

# The subject set is a superset of the query set, grown to args.m items.
# BUG FIX: the original `subject_data = query_data` aliased the same set
# object, so growing subject_data silently mutated query_data too (only
# harmless because query_data was already written out). Copy instead.
subject_data = set(query_data)
while len(subject_data) < args.m:
    subject_data.add(generate_item(args.s1, args.s2))

# Each subject line: args.m2 random decoy items plus one subject item,
# shuffled, joined with '; '.
with open(args.subject_file, 'w') as fh:
    for item in subject_data:
        items = [generate_item(args.s1, args.s2) for _ in range(args.m2)]
        items.append(item)
        random.shuffle(items)
        fh.write('{}\n'.format('; '.join(items)))
#!/usr/bin/env python
"""Report subject-file lines containing any item from a query list."""
import argparse
import random  # NOTE(review): unused in this script; kept since only part of the file may be visible
import sys

# BUG FIX: user-facing description had a typo ("quering" -> "querying").
parser = argparse.ArgumentParser(description="querying by item list",
                                 epilog="https://github.com/shenwei356/")
parser.add_argument('--query-file', type=str, default="query.txt", help='query file')
parser.add_argument('--subject-file', type=str, default="subject.txt", help='subject file')
parser.add_argument('--result-file', type=str, default="result.txt", help='result file')
parser.add_argument('-v', action='store_true', help='verbosely print information')
args = parser.parse_args()
# Load the query items into a set so membership tests below are O(1).
sys.stderr.write('reading query list...\n')
with open(args.query_file) as fh:
    query_data = {line.strip() for line in fh}
sys.stderr.write('{} items loaded\n'.format(len(query_data)))
# Scan the subject file; echo every line that shares at least one
# '; '-separated item with the query set, counting lines and hits.
sys.stderr.write("start querying...\n")
i, hit = 0, 0
with open(args.result_file, 'w') as fout, open(args.subject_file) as fh:
    for i, line in enumerate(fh, 1):
        # isdisjoint short-circuits on the first common item, exactly
        # like the original inner loop with its break.
        if not query_data.isdisjoint(line.strip().split('; ')):
            fout.write(line)
            hit += 1
            if args.v:
                sys.stderr.write('hit {} from {}\n'.format(hit, i))
sys.stdout.write('hit {} from {}\n'.format(hit, i))

generate data

$ memusg -t ./generate_data.py

elapsed time: 4m:22s
peak rss: 333.0 MB

$ head -n 3 query.txt
6p3orr
hfoc14r
paq_dm

$ head -n 3 subject.txt
yd0msg; wfcou55rt; rp70424ik; ovuts5; 6p3orr
1202t; f5v9343bc4; op7xyup_4; oap222d; eta1d
9u4quwr; f4brcrkby; 53wmkvny4; aoaod5qi6u; 2k962

searching

$ memusg -t ./query.py
reading query list...
700000 items loaded
start querying...
hit 702298 from 3000000

elapsed time: 6.772s
peak rss: 84.36 MB

using csvtk

csvtk

# delete the space after ";"
$ sed 's/; /;/g' subject.txt  > subject2.txt

$ memusg -t csvtk -d ';' grep -F -f "*" -P query.txt subject2.txt > result2.txt

elapsed time: 7.990s                                                                                                   
peak rss: 90.36 MB 

csvtk uses regular expressions to check all columns, so it was slower.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment