Skip to content

Instantly share code, notes, and snippets.

@copyleftdev
Created February 26, 2016 05:30
Show Gist options
  • Save copyleftdev/952411d04adb0e64d1ea to your computer and use it in GitHub Desktop.
Save copyleftdev/952411d04adb0e64d1ea to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# This tool randomly selects articles from a selection of Wikipedia articles
# and randomly seeds them with PII data
import random
import os
from faker import Factory
# Module-level Faker generator; seedDocuments calls its credit_card_number()
# and ssn() providers to produce the fake PII values it plants in documents.
fake = Factory.create()
def dedupe(seq):
    """Return a new list with the items of ``seq`` in first-seen order, duplicates removed.

    ``seq`` itself is not modified, so callers must use the return value.
    Items must be hashable because membership is tracked in a set.
    """
    seen = set()
    unique = []
    for item in seq:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique
def randomDocuments(rootFolder):
    """Return the path of one randomly chosen file under ``rootFolder``.

    The tree is walked recursively with ``os.walk`` and every file found,
    at any depth, is an equally likely pick.

    Raises:
        IndexError: if no files are found under ``rootFolder``. This is the
            exception type ``random.choice`` raises for an empty sequence;
            it is raised explicitly here so the message names the folder.
    """
    candidates = [
        os.path.join(dirpath, filename)
        for dirpath, _, files in os.walk(rootFolder)
        for filename in files
    ]
    if not candidates:
        raise IndexError('no files found under {!r}'.format(rootFolder))
    return random.choice(candidates)
def _listDocuments(rootFolder):
    """Return the paths of every file found recursively under ``rootFolder``."""
    return [
        os.path.join(dirpath, filename)
        for dirpath, _, files in os.walk(rootFolder)
        for filename in files
    ]


def seedDocuments(rootFolder, count, piiFolder):
    """Copy random documents into ``piiFolder``, planting a fake PII marker in each.

    Up to ``count`` distinct files are picked at random from ``rootFolder``
    (searched recursively). Each is written to ``piiFolder`` as
    ``pii_<n>.txt`` (n starting at 1) with the text
    `` PII VECTOR ( <value> ) `` written over the content at a random offset,
    where ``<value>`` is a fake credit-card number or SSN from ``fake``.

    Args:
        rootFolder: directory tree holding the source documents.
        count: number of documents to produce. If fewer files exist than
            requested, every available file is used once.
        piiFolder: output directory; created (including parents) if missing.

    Raises:
        IndexError: if ``count`` is positive and ``rootFolder`` has no files.
    """
    if not os.path.exists(piiFolder):
        os.makedirs(piiFolder)
    print('Randomizing {} Documents'.format(count))

    # Walk the tree once and sample without replacement: this guarantees
    # distinct source documents and cannot loop forever on a small folder.
    documents = _listDocuments(rootFolder)
    if count > 0 and not documents:
        raise IndexError('no files found under {!r}'.format(rootFolder))
    chosen = random.sample(documents, max(0, min(count, len(documents))))

    for index, each_doc in enumerate(chosen, 1):
        with open(each_doc, 'r') as seedread:
            filecontent = seedread.read()
        piiproviders = [fake.credit_card_number(), fake.ssn()]
        vector = ' PII VECTOR ( {} ) '.format(random.choice(piiproviders))
        # Pick the offset in characters and splice in memory rather than
        # seeking a text-mode file to a computed byte position; this also
        # copes with empty and one-character documents.
        offset = random.randint(0, max(len(filecontent) - 2, 0))
        # Overwrite (not insert) at the offset; near the end of the document
        # the marker simply extends past the original content.
        seeded = filecontent[:offset] + vector + filecontent[offset + len(vector):]
        outpath = os.path.join(piiFolder, 'pii_{}.txt'.format(index))
        with open(outpath, 'w') as outfile:
            outfile.write(seeded)
    print('Document creation complete,files are located in folder {}'.format(os.path.abspath(piiFolder)))
if __name__ == '__main__':
    # Run the seeding only when executed as a script, not on import:
    # take 10 documents from ./data and write the seeded copies to ./needle.
    seedDocuments('data', 10, 'needle')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment