Skip to content

Instantly share code, notes, and snippets.

@lucasmenendez
Created August 12, 2022 19:53
Show Gist options
  • Save lucasmenendez/ff9f98f3a64378263d85a3cca037148b to your computer and use it in GitHub Desktop.
Save lucasmenendez/ff9f98f3a64378263d85a3cca037148b to your computer and use it in GitHub Desktop.
Python script that generates two UUID datasets sharing 2.5% of their records, used to test the lucasmenendez/goPSI project. Usage: python generate_datasets.py <number_of_records>
import sys
import random
import uuid
# Number of records per dataset, taken from the first command-line argument.
# A missing or non-integer argument ends the run with the usage line instead
# of an uncaught IndexError/ValueError traceback.
try:
    numOfUUIDS = int(sys.argv[1])
except (IndexError, ValueError):
    sys.exit("Usage: python generate_datasets.py <number_of_records>")
def _uuid_line():
    """Return a freshly generated UUID4 string terminated by a newline."""
    return str(uuid.uuid4()) + "\n"


def main(num_uuids=None, common_ratio=0.025, path_a="dataA", path_b="dataB"):
    """Generate two shuffled UUID datasets that share a fraction of records.

    Each dataset is written as a text file with one UUID4 string per line and
    ``num_uuids`` lines in total. ``int(num_uuids * common_ratio)`` of those
    records are written to both files; the remaining ones are generated
    independently for each file. Both datasets are shuffled before being
    stored so the shared records are not grouped at the end.

    Args:
        num_uuids: Number of records per dataset. When omitted, the
            module-level ``numOfUUIDS`` value is used.
        common_ratio: Fraction of records (between 0 and 1) present in both
            datasets. Defaults to 0.025 (2.5%).
        path_a: Output path of the first dataset. Defaults to ``"dataA"``.
        path_b: Output path of the second dataset. Defaults to ``"dataB"``.

    Raises:
        ValueError: If ``num_uuids`` is negative or ``common_ratio`` is not
            within the 0..1 range.
    """
    if num_uuids is None:
        # Keep the original zero-argument call working: fall back to the
        # module-level record count.
        num_uuids = numOfUUIDS
    if num_uuids < 0:
        raise ValueError("number of records must not be negative")
    if not 0 <= common_ratio <= 1:
        raise ValueError("common_ratio must be between 0 and 1")

    num_commons = int(num_uuids * common_ratio)
    num_random = num_uuids - num_commons
    print("Generating 2 datasets with %d random UUID's (%d in both)..." % (num_uuids, num_commons))

    print("Generating unique datasets records...")
    data_a = [_uuid_line() for _ in range(num_random)]
    data_b = [_uuid_line() for _ in range(num_random)]

    print("Generating common datasets records...")
    commons = [_uuid_line() for _ in range(num_commons)]
    data_a.extend(commons)
    data_b.extend(commons)

    # Shuffle each dataset independently, then store it, one after the other.
    datasets = (("first", data_a, path_a), ("second", data_b, path_b))
    for label, records, path in datasets:
        print("Shuffleling %s dataset..." % label)
        random.shuffle(records)
        print("Storing %s dataset..." % label)
        with open(path, "w") as fd:
            fd.writelines(records)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment