Skip to content

Instantly share code, notes, and snippets.

@raden
Forked from efaisal/ngram_najmi.py
Last active December 14, 2015 19:18
Show Gist options
  • Save raden/5135153 to your computer and use it in GitHub Desktop.
Save raden/5135153 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import os
from itertools import izip_longest
from collections import Counter
###############################################################################
# Configuration
#
# Directory whose files are read as input.
INPUT_DIR = 'apireport'
# Root of the output tree; one zero-padded sub-directory is used per n-gram
# size.
OUTPUT_DIR = 'result/ngram'
# N-gram sizes computed for every input file: 1 through 10 inclusive.
NGRAMS = list(range(1, 11))
###############################################################################
def ngram(ngram, f):
    """Count n-grams built from the last comma-separated field of each line.

    The first three lines and the final line of the file are skipped.  From
    every remaining line the text after the last comma (with surrounding
    whitespace stripped) is taken as one token.  Each run of ``ngram``
    consecutive tokens is joined with ``','`` to form one n-gram.

    Args:
        ngram: Window size, i.e. how many consecutive tokens make up one
            n-gram.  Values below 1 behave like 1.
        f: Path of the file to read.

    Returns:
        A ``(vectors, counter)`` tuple.  ``vectors`` is a list of the
        distinct n-grams in order of first appearance; ``counter`` is a
        ``collections.Counter`` mapping each n-gram to its number of
        occurrences.  Both are empty when the file holds fewer than
        ``ngram`` tokens.
    """
    with open(f) as fp:
        # NOTE(review): presumably the skipped lines are a report header and
        # footer -- confirm against the input format.
        lines = fp.readlines()[3:-1]
    tokens = [line.strip().split(',')[-1] for line in lines]

    # Column i is the token list shifted left by i positions, so zipping the
    # columns yields every window of consecutive tokens.  zip() stops at the
    # shortest column, which drops the incomplete windows at the tail.
    columns = [tokens] + [tokens[offset:] for offset in range(1, ngram)]

    vectors = []
    counter = Counter()
    for window in zip(*columns):
        api_n = ','.join(window)
        # Membership is tested on the counter (a dict) rather than on the
        # list, keeping first-seen ordering without a linear scan per n-gram.
        if api_n not in counter:
            vectors.append(api_n)
        counter[api_n] += 1
    return vectors, counter
def dir_traversal(d):
    """Return the names of the files located directly inside directory ``d``.

    Only the top level of ``d`` is inspected; sub-directories are not
    descended into.  Names are returned without any directory prefix.

    Args:
        d: Path of the directory to list.

    Returns:
        A list of file names.  The list is empty when ``os.walk`` yields
        nothing for ``d`` (for example because the directory does not
        exist), so callers can always iterate over the result.
    """
    # The first tuple produced by os.walk() describes ``d`` itself, so
    # returning from the first iteration limits the listing to the top level.
    for _dirpath, _dirnames, files in os.walk(d):
        return files
    return []
if __name__ == '__main__':
    # Script entry point: for every file in INPUT_DIR and every size in
    # NGRAMS, compute the n-gram counts and write them as "<ngram>,<count>"
    # lines to OUTPUT_DIR/<NN>/ngram-<N>-<filename>.
    import sys

    # sys.stdout.write is used for progress messages so that each
    # "<message> [ Done ]" pair stays on one line on both Python 2 and 3.
    progress = sys.stdout.write

    progress('Fetch all files ')
    # Fall back to an empty list so a missing input directory means
    # "nothing to process" instead of a TypeError in the loop below.
    filenames = dir_traversal(INPUT_DIR) or []
    progress('[ Done ]\n')

    # Create the output root and one zero-padded sub-directory per n-gram
    # size.  makedirs also creates missing parents (OUTPUT_DIR is nested).
    output_dirs = [OUTPUT_DIR]
    output_dirs.extend(os.path.join(OUTPUT_DIR, '%02d' % size) for size in NGRAMS)
    for path in output_dirs:
        try:
            os.makedirs(path, 0o755)
        except OSError:
            # An already existing directory is fine; anything else (e.g.
            # permission denied) would make the writes below fail, so
            # surface it here.
            if not os.path.isdir(path):
                raise

    for f in filenames:
        source = os.path.join(INPUT_DIR, f)
        for size in NGRAMS:
            progress('Processing ngram %d for %s ' % (size, f))
            vectors, result = ngram(size, source)
            progress('[ Done ]\n')

            progress('Writing results for %s ' % f)
            target = os.path.join(OUTPUT_DIR, '%02d' % size,
                                  'ngram-' + str(size) + '-' + f)
            with open(target, 'w') as fp:
                # One "<ngram>,<count>" line per distinct n-gram, in order
                # of first appearance.
                fp.writelines('%s,%s\n' % (v, result[v]) for v in vectors)
            progress('[ Done ]\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment