Skip to content

Instantly share code, notes, and snippets.

@flangofas
Last active December 28, 2016 09:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save flangofas/22c5a980bddf80a979ad97179eba4756 to your computer and use it in GitHub Desktop.
Save flangofas/22c5a980bddf80a979ad97179eba4756 to your computer and use it in GitHub Desktop.
Creating JSON with a million records
{
"name": "aflangofas/json",
"require": {
"fzaninotto/faker": "^1.6"
},
"authors": [
{
"name": "Antonis Flangofas",
"email": "antonisflags@gmail.com"
}
]
}
<?php
// Import every file found in the output folder into MongoDB (database
// "test", collection "users") by shelling out to the mongoimport CLI.
$folder = 'output/';
$listing = scandir($folder);
if ($listing === false) {
    // scandir() returns false when the folder is missing or unreadable.
    fwrite(STDERR, 'Unable to read folder ' . $folder . PHP_EOL);
    exit(1);
}
$files = array_diff($listing, ['.', '..']);
foreach ($files as $file) {
    echo 'Importing ' . $file . PHP_EOL;
    // Quote the path so a file name containing spaces or shell
    // metacharacters is passed to mongoimport as one literal argument.
    $path = escapeshellarg($folder . $file);
    $output = shell_exec('mongoimport --db test --collection users --jsonArray --file ' . $path);
    echo $output . PHP_EOL;
}
<?php
require __DIR__ . '/vendor/autoload.php';
use Faker\Factory;
// Command-line arguments: [1] number of batches (default 5),
// [2] entries per batch (default 6000).
// Cast to int so the loops that use these values compare integers rather
// than raw argv strings; a non-numeric argument becomes 0.
$iterations = isset($argv[1]) ? (int) $argv[1] : 5;
$entries = isset($argv[2]) ? (int) $argv[2] : 6000;
/**
 * Format a byte count as a short human-readable string, e.g. "1.5 kb".
 *
 * @param int|float $size Number of bytes.
 * @return string The scaled value rounded to two decimals, a space, and the unit.
 */
function convert($size)
{
    $unit = ['b', 'kb', 'mb', 'gb', 'tb', 'pb'];
    $last = count($unit) - 1;
    $index = 0;
    // Scale down step by step instead of using log(): log(0) is -INF, which
    // led to a division by zero. The index is also clamped to the last unit.
    while ($size >= 1024 && $index < $last) {
        $size /= 1024;
        $index++;
    }
    return round($size, 2) . ' ' . $unit[$index];
}
/**
 * Return a pseudo-random float between the two bounds.
 *
 * The bounds may be given in either order; the result always lies between
 * the smaller and the larger one.
 *
 * @param int|float $min One bound of the interval.
 * @param int|float $max The other bound of the interval.
 * @return float
 */
function random_float($min, $max)
{
    // Normalise the bounds first: offsetting from $min by abs($max - $min)
    // produced values above $min when the arguments were reversed.
    $low = min($min, $max);
    $high = max($min, $max);
    return $low + lcg_value() * ($high - $low);
}
/**
 * Pick one value at random from an array.
 *
 * @param array $container Values to choose from.
 * @return mixed A randomly selected value, or false when $container is empty.
 */
function array_rand_value($container = [])
{
    if (!empty($container)) {
        // array_rand() yields a random key; look up the value stored under it.
        return $container[array_rand($container)];
    }

    return false;
}
/**
 * Build a list of fake e-mail entries.
 *
 * @param int $num Number of entries to generate; zero or less yields an empty list.
 * @return array List of ['value' => string, 'type' => 'personal'|'work'] entries.
 */
function createEmails($num = 1)
{
    // Reuse one generator across calls instead of building a new one each time.
    static $faker = null;
    if ($faker === null) {
        $faker = Factory::create();
    }
    $emails = [];
    // A counted loop terminates for any $num; the previous
    // do { ... } while ($num) never ended when $num was 0 or negative.
    for ($i = 0; $i < $num; $i++) {
        $emails[] = [
            'value' => $faker->email,
            'type' => array_rand_value(['personal', 'work'])
        ];
    }
    return $emails;
}
/**
 * Build a list of fake phone-number entries.
 *
 * Only the first entry is flagged as the default number.
 *
 * @param int $num Number of entries to generate; zero or less yields an empty list.
 * @return array List of ['value' => string, 'type' => 'personal'|'work', 'is_Default' => bool] entries.
 */
function createPhoneNumbers($num = 1)
{
    // Reuse one generator across calls instead of building a new one each time.
    static $faker = null;
    if ($faker === null) {
        $faker = Factory::create();
    }
    $numbers = [];
    // A counted loop terminates for any $num; the previous
    // do { ... } while ($num) never ended when $num was 0 or negative.
    for ($i = 0; $i < $num; $i++) {
        $numbers[] = [
            'value' => $faker->phoneNumber(),
            'type' => array_rand_value(['personal', 'work']),
            'is_Default' => ($i === 0)
        ];
    }
    return $numbers;
}
/**
 * Build a list of fake postal-address entries.
 *
 * @param int $num Number of entries to generate; zero or less yields an empty list.
 * @return array List of arrays with the keys type, address, city, zip, country and state.
 */
function createAddresses($num = 1)
{
    // Reuse one generator across calls instead of building a new one each time.
    static $faker = null;
    if ($faker === null) {
        $faker = Factory::create();
    }
    $addresses = [];
    // A counted loop terminates for any $num; the previous
    // do { ... } while ($num) never ended when $num was 0 or negative.
    for ($i = 0; $i < $num; $i++) {
        $addresses[] = [
            'type' => array_rand_value(['work', 'home']),
            'address' => $faker->address(),
            'city' => $faker->city(),
            'zip' => $faker->postcode(),
            'country' => $faker->country(),
            'state' => $faker->stateAbbr()
        ];
    }
    return $addresses;
}
/**
 * Build the fake survey result attached to an event.
 *
 * About one call in five (rand(0, 4) === 0) returns an empty array instead
 * of a filled-in survey.
 *
 * @param array $options Must contain 'serviceDate', which is used for both
 *                       'dateSent' and 'dateCompleted'.
 * @return array The survey result, or an empty array.
 */
function createResults($options)
{
    // Decide up front whether this survey is empty so the answers below
    // are not generated only to be thrown away.
    if (rand(0, 4) === 0) {
        return [];
    }
    // Read the one value needed directly rather than extract()-ing every
    // option into the local scope.
    $serviceDate = isset($options['serviceDate']) ? $options['serviceDate'] : null;
    $agreement = ['Disagree Strongly', 'Disagree', 'Neutral', 'Agree', 'Agree Strongly'];
    // NOTE(review): the "Professionalt" key below looks like a typo; it is
    // kept unchanged because the key is part of the emitted data.
    return [
        "finalScore" => number_format(random_float(0.00, 10.00), 2, '.', ''),
        "dateSent"=> $serviceDate,
        "dateCompleted" => $serviceDate,
        "surveyDetails" => [
            "On a scale of 0-10, please rate your level of satisfaction with your service(s) (with 10 being the highest level of satisfaction):" => rand(0,10),
            "What is the likelihood that you would suggest Vein Clinics of America to a friend or family member interested in similar services? (10 being the greatest likelihood):" => rand(0,10),
            "The Office Staff: The Scheduling Process was Easy and Convenient" => array_rand_value($agreement),
            "The Office Staff: Your Greeting and Send-Off were Warm and Professionalt" => array_rand_value($agreement),
            "The Office Staff: Your Wait was Brief and Comfortable" => array_rand_value($agreement),
            "The Office Staff: The Payment Process was Easy and Convenient" => array_rand_value($agreement),
            "What factors were most important in choosing this location for your procedure? (choose any that apply)" => array_rand_value(["Confidence that I would get great results","Positive Consultation Experience","Staff Expertise","Travel (Distance)"]),
            "Getting to Know You: For the next few questions, please choose the answer that accurately describes you, starting with Gender:" => array_rand_value(['male', 'female', 'prefer not to say'])
        ]
    ];
}
/**
 * Build a list of fake survey events for one client brand.
 *
 * All events produced by a single call share the same client contact id
 * and the same service date (the date is passed on to createResults()).
 *
 * @param int   $num     Number of events to generate; zero or less yields an empty list.
 * @param array $options Person and brand data. Keys read here: id, city,
 *                       firstname, lastname, dob, email (array with a
 *                       'value' key), address, state, zip.
 * @return array List of event arrays.
 */
function createEvents($num = 50, $options = [])
{
    // Reuse one generator across calls instead of building a new one each time.
    static $faker = null;
    if ($faker === null) {
        $faker = Factory::create();
    }
    $serviceDate = $faker->dateTimeBetween('-3 years', 'now')->format('Y-m-d');
    // Random 7-digit id whose first digit is never 0.
    $cId = rand(1000000, 9999999);
    // Read the fields explicitly instead of extract()-ing $options, which
    // could silently overwrite locals such as $num or $events. Missing
    // keys default to null.
    $fields = $options + array_fill_keys(
        ['id', 'city', 'firstname', 'lastname', 'dob', 'email', 'address', 'state', 'zip'],
        null
    );
    $emailValue = isset($fields['email']['value']) ? $fields['email']['value'] : null;
    $resultOptions = array_merge($options, ['serviceDate' => $serviceDate]);
    $events = [];
    // A counted loop terminates for any $num; the previous
    // do { ... } while ($num) never ended when $num was 0 or negative.
    for ($i = 0; $i < $num; $i++) {
        $events[] = [
            'clientBrandID' => $fields['id'],
            'surveyType' => array_rand_value([5005, 5004, 5003, 5002, 5001]),
            'location'=> $fields['city'],
            'clientContactID' => $cId,
            "firstName" => $fields['firstname'],
            "lastName" => $fields['lastname'],
            "birthDate" => $fields['dob'],
            "email" => $emailValue,
            "address" => $fields['address'],
            "city" => $fields['city'],
            "state" => $fields['state'],
            "zip" => $fields['zip'],
            "consultDate" => "",
            "consultantName" => "",
            "consultServiceType" => "",
            // NOTE(review): hard-coded 2 although $serviceDate is computed
            // above; looks like a placeholder. Confirm before changing the
            // emitted data.
            "serviceDate" => 2,
            "serviceProviderName" => "Carolyn Thas, RN",
            "serviceType" => array_rand_value(['MVD', 'DVM', 'MDV', 'DVMM', 'MMM']),
            "consultCancellationDate" => "",
            "serviceRevenue" => number_format(random_float(0.00,20000.00), 2, '.', ''),
            "userVarChar4" => "Insurance",
            "leadSource" => array_rand_value(['DTC', 'Phone', 'Website', 'Social Media', 'Other']),
            "results" => createResults($resultOptions)
        ];
    }
    return $events;
}
/**
 * Build the per-brand client records for one survey taker.
 *
 * A random tail of the fixed brand-id list is used, so between one and
 * five records are returned.
 *
 * @param array $options Person data. Keys read here: firstname, lastname,
 *                       address, city, state, zip, and optionally dob and
 *                       email (array with a 'value' key).
 * @return array List of ['clientId' => int, 'mainInfo' => array, 'events' => array].
 */
function createClients($options)
{
    // Prefer values passed in $options. Fall back to the script-level
    // variables so callers that do not pass them keep working, without
    // binding globals that extract() would then write through.
    $dob = isset($options['dob'])
        ? $options['dob']
        : (isset($GLOBALS['dob']) ? $GLOBALS['dob'] : null);
    $email = isset($options['email'])
        ? $options['email']
        : (isset($GLOBALS['email']) ? $GLOBALS['email'] : null);
    $emailValue = isset($email['value']) ? $email['value'] : null;
    // Missing keys default to null.
    $fields = $options + array_fill_keys(
        ['firstname', 'lastname', 'address', 'city', 'state', 'zip'],
        null
    );
    $clientIds = [35, 67, 73, 78, 82];
    // Keep everything from a random offset to the end of the list.
    $clientIds = array_slice($clientIds, rand(0, count($clientIds) - 1));
    $eventOptions = array_merge($options, ['dob' => $dob, 'email' => $email]);
    $result = [];
    foreach ($clientIds as $id) {
        $result[] = [
            'clientId' => $id,
            'mainInfo' => [
                "firstName" => $fields['firstname'],
                "lastName" => $fields['lastname'],
                "birthDate" => $dob,
                "email" => $emailValue,
                "address" => $fields['address'],
                "city" => $fields['city'],
                "state" => $fields['state'],
                "zip" => $fields['zip'],
            ],
            'events' => createEvents(rand(1,5), array_merge($eventOptions, ['id' => $id]))
        ];
    }
    return $result;
}
// Generate $iterations batch files, each holding $entries fake survey-taker
// records, and write them as JSON arrays to output/user-dataset-<n>.json.
$faker = Factory::create();
$outputDir = 'output/';
// Create the target folder up front so the first write cannot fail on a
// missing directory.
if (!is_dir($outputDir) && !mkdir($outputDir, 0777, true)) {
    fwrite(STDERR, 'Unable to create folder ' . $outputDir . PHP_EOL);
    exit(1);
}
for ($y = 1; $y <= $iterations; $y++) {
    // Start every batch with an empty list.
    $c = [];
    for ($i = 0; $i < $entries; $i++) {
        //survey taker
        $firstname = $faker->firstName;
        $lastname = $faker->lastName;
        $dob = $faker->dateTimeThisCentury->format('Y-m-d');
        $emails = createEmails(rand(1,2));
        $email = $emails[0];
        $addresses = createAddresses(rand(1,2));
        $contact = $addresses[0];
        // The first address and first e-mail are the primary contact details
        // handed to createClients(); 'dob' is passed explicitly as well.
        $options = array_merge($contact, [
            'firstname' => $firstname,
            'lastname' => $lastname,
            'email' => $email,
            'dob' => $dob
        ]);
        $c[] = [
            'name' => [
                'firstname' => $firstname,
                'lastname' => $lastname,
            ],
            'emails' => $emails,
            'phoneNumbers' => createPhoneNumbers(rand(1,2)),
            'addresses' => $addresses,
            'dob' => $dob,
            'clients' => createClients($options)
        ];
    }
    echo "Batch " . $y . PHP_EOL;
    echo "Memory usage " . convert(memory_get_usage(true)) . PHP_EOL;
    $fileName = 'user-dataset-' . $y . '.json';
    // JSON_PRETTY_PRINT is a json_encode() option; it was previously passed
    // as the flags argument of file_put_contents().
    $json = json_encode($c, JSON_PRETTY_PRINT);
    if ($json === false || file_put_contents($outputDir . $fileName, $json) === false) {
        fwrite(STDERR, 'Failed to write ' . $outputDir . $fileName . PHP_EOL);
        exit(1);
    }
    // Release the batch before building the next one.
    $c = $json = $emails = $addresses = $contact = $options = null;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment