Skip to content

Instantly share code, notes, and snippets.

@psorianom
Last active April 9, 2019 12:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save psorianom/1996422a692f111883450cd36f62e72b to your computer and use it in GitHub Desktop.
Save psorianom/1996422a692f111883450cd36f62e72b to your computer and use it in GitHub Desktop.
'''Generates a synthetic dataset (csv) of persons to test the SNU_assignator
Usage:
SNU_gen.py <o> [options]
Arguments:
<o> An output path to store the synthetic data csv
-n PER Number of persons to generate [default: 2000:int]
-f FIL Representation proportion of the filiere. Ex: "0.1,0.1,...,0.1" (default: None)
-r RES Representation proportion of the residence Ex: "0.1,0.1,...,0.1" (default: None)
-s SEX Representation proportion of the sexe Ex: "0.3,0.7" (default: None)
'''
import logging
from math import isclose
import pandas as pd
import numpy as np
from argopt import argopt
# Root logger; its level is lowered to INFO so informational records are not
# filtered out by the logger itself.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Categories each generated column may take. The position of a category in its
# list is the position of its probability in the matching proportions vector.
FIXED_VALUES = dict(
    filiere=["LGT", "LP", "MILO", "CFA", "EMS", "PJJ", "SHN", "Actif"],
    residence=[
        "Ardennes",
        "Puy-de-Dôme",
        "Cher",
        "Haute-Saône",
        "Morbihan",
        "Eure",
        "Haute-Pyrénées",
        "Loire-Atlantique",
        "Vaucluse",
        "Guyane",
        "Nord",
        "Creuse",
        "Val d'Oise",
    ],
    sexe=["F", "M"],
)
def generate_syn_data(output_path, n_persons, proportions):
    '''Draw a random synthetic population and write it to a CSV file.

    One column is produced for each of "filiere", "residence" and "sexe";
    every cell is an independent draw (via ``np.random.choice``) from the
    matching list in FIXED_VALUES. The frame is written with its integer
    index (0 .. n_persons - 1) as the first CSV column.

    Args:
        output_path: Destination handed to ``DataFrame.to_csv``.
        n_persons: Number of rows (persons) to generate.
        proportions: Mapping from column name to a probability vector
            aligned with ``FIXED_VALUES[column]``. A missing or ``None``
            entry makes that column a uniform draw.
    '''
    # Columns are drawn in a fixed order so a seeded numpy RNG yields a
    # reproducible file.
    columns = {
        col: np.random.choice(FIXED_VALUES[col], n_persons, p=proportions.get(col))
        for col in ("filiere", "residence", "sexe")
    }
    df = pd.DataFrame(columns, index=np.arange(n_persons))
    df.to_csv(output_path, encoding="utf8")
def parse_proportions(raw, n_values):
    '''Parse a comma-separated probability vector such as "0.3,0.7".

    Args:
        raw: Comma-separated floats, as received from the command line.
        n_values: Number of categories the vector has to cover.

    Returns:
        A 1-D numpy array holding ``n_values`` floats.

    Raises:
        ValueError: If an item is not a float, the number of items differs
            from ``n_values``, any value is negative, or the values do not
            sum to 1.
    '''
    try:
        probs = np.array([float(item) for item in raw.split(",")])
    except ValueError as err:
        raise ValueError(
            "could not parse {0!r} as comma-separated floats".format(raw)) from err
    if len(probs) != n_values:
        raise ValueError(
            "expected {0} values, got {1}".format(n_values, len(probs)))
    # np.random.choice rejects negative entries, so catch them here instead
    # of letting the generation step crash later.
    if np.any(probs < 0) or not isclose(probs.sum(), 1.0):
        raise ValueError("values must be non-negative and sum to 1.0")
    return probs


if __name__ == '__main__':
    args = argopt(__doc__).parse_args()
    output_path = args.o
    n_persons = args.n

    # Raw option strings per column; an entry is falsy when the option was
    # not supplied on the command line.
    raw_proportions = {"filiere": args.f, "residence": args.r, "sexe": args.s}

    proportions = {}
    for column, raw in raw_proportions.items():
        n_values = len(FIXED_VALUES[column])
        uniform = np.ones(n_values) / n_values
        if not raw:
            proportions[column] = uniform
            continue
        try:
            proportions[column] = parse_proportions(raw, n_values)
        except ValueError as err:
            # Single format string with lazy %-args: extra positional
            # arguments to logger.error are format arguments, not additional
            # message fragments.
            logger.error(
                "Invalid %s proportions (%s). Using the same proportion for each one.",
                column, err)
            proportions[column] = uniform

    generate_syn_data(output_path=output_path, n_persons=n_persons, proportions=proportions)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment