Skip to content

Instantly share code, notes, and snippets.

@ruanbekker
Last active July 18, 2016 09:53
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ruanbekker/d56781f97901c6ba7f04ffab7c3a7da8 to your computer and use it in GitHub Desktop.
Save ruanbekker/d56781f97901c6ba7f04ffab7c3a7da8 to your computer and use it in GitHub Desktop.
Generates CSV Data using the Faker library
#!/usr/bin/python
# example usage: ./generate-csv-data.py --filename data --number-runs 10000 --number-reiterations 5
from faker import Factory
import sys
import time
# Usage banner printed when the command line is rejected. The script name is
# taken from argv so the text matches however the script was invoked.
errInvalidArgs = (
    "Usage: {0} --filename [STRING]  --number-runs [INT] "
    "--number-reiterations [INT] "
).format(sys.argv[0])
# Worked example shown directly under the usage banner.
errEg = (
    " -> eg: {0} --filename dataset --number-runs 1000000 "
    "--number-reiterations 5"
).format(sys.argv[0])
# Hint describing the naming pattern of the generated file(s).
errOutput = "Outputs: dataset-timestamp.txt"
def create_names(numberRuns, file_object, fake):
    """Write ``numberRuns`` generated rows to ``file_object`` as CSV.

    Each row holds four values obtained from ``fake``, in this order:
    ``uuid4()``, ``first_name()``, ``last_name()``, ``country()``.

    Args:
        numberRuns: Number of rows to write; zero or a negative value
            writes nothing.
        file_object: Writable text stream (anything with ``write``).
        fake: Object providing the four generator methods listed above.
    """
    # Local import keeps this function self-contained.
    import csv

    # csv.writer quotes any value that contains a comma, quote or newline,
    # which plain string concatenation would emit as a corrupt row.
    # lineterminator="\n" keeps the original one-"\n"-per-row output.
    writer = csv.writer(file_object, lineterminator="\n")
    for _ in range(numberRuns):
        writer.writerow([
            fake.uuid4(),
            fake.first_name(),
            fake.last_name(),
            fake.country(),
        ])
# [file_object.write("%s, %s, %s, %s\n"% (fake.uuid4(), fake.first_name(), fake.last_name(), fake.country()
#)) for x in range(numberRuns)]
def _exit_with_usage():
    """Print the usage banner, example and output hint, then exit with -1."""
    print(errInvalidArgs)
    print(errEg)
    print(errOutput)
    sys.exit(-1)


if __name__ == "__main__":
    # Expected command line (exactly seven argv entries):
    #   prog --filename NAME --number-runs N --number-reiterations M
    if len(sys.argv) != 7:
        _exit_with_usage()

    # Every flag must sit in its fixed position; one wrong flag is enough to
    # reject the command line (the previous check used "and" and ignored the
    # third flag, so most malformed invocations slipped through).
    if (sys.argv[1] != "--filename"
            or sys.argv[3] != "--number-runs"
            or sys.argv[5] != "--number-reiterations"):
        _exit_with_usage()

    try:
        numberRuns = int(sys.argv[4])
        numCall = int(sys.argv[6])
    except ValueError:
        # Non-numeric counts get the usage text instead of a traceback.
        _exit_with_usage()

    # One generator is enough for all iterations.
    fake = Factory.create()

    for _ in range(numCall):
        # Second-resolution timestamp, used only to name the output file.
        timestamp = time.strftime("%Y%m%d%H%M%S")
        destFile = sys.argv[2] + "-" + timestamp + ".txt"
        print("Creating File: " + destFile)

        # Measure elapsed time with a real clock; subtracting formatted
        # YYYYmmddHHMMSS values is wrong across minute/hour rollovers.
        started = time.time()

        # NOTE(review): iterations that start within the same second share a
        # file name, and append mode then adds another header plus data to
        # that file -- confirm whether that is intended.
        with open(destFile, "a") as file_object:
            # NOTE(review): the header labels (uuid, username, name, country)
            # do not match the values create_names writes (uuid, first name,
            # last name, country); left unchanged in case consumers rely on it.
            file_object.write("uuid,username,name,country\n")
            create_names(numberRuns, file_object, fake)

        elapsed = time.time() - started

        print("Generated " + str(numberRuns) + " Records")
        print("Job took: " + str(round(elapsed, 2)) + " seconds")
        if elapsed > 0:
            print("That is " + str(int(numberRuns / elapsed)) + " Records per second!")
        else:
            # Guard against a zero-length interval instead of dividing by zero.
            print("That is too fast to measure a rate!")
        print("\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment