Skip to content

Instantly share code, notes, and snippets.

@gregmacfarlane
Created May 10, 2018 17:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gregmacfarlane/8b19ac311420dcef8bf44aaac289996c to your computer and use it in GitHub Desktop.
Test doppelganger on different-sized marginals
# Standard-library imports.
import csv
import os
# NOTE(review): changing the working directory at import time is a global side
# effect — every relative path below resolves against this examples directory.
os.chdir('/Users/gregmacfarlane/tf/doppelganger/examples')
import pandas as pd
import logging
# Module-level logger; DEBUG basicConfig so doppelganger's internal logging shows.
log = logging.getLogger(__name__)
logging.basicConfig(level=logging.DEBUG)
# doppelganger public API used by this script.
from doppelganger import (
allocation,
inputs,
Configuration,
HouseholdAllocator,
PumsData,
SegmentedData,
BayesianNetworkModel,
Population,
Preprocessor,
Marginals
)
# Geography under test: state FIPS 06, PUMA 00106.
STATE = '06'
PUMA = '00106'
output_dir = ".test"

log.info("Loading configuration and data")
configuration = Configuration.from_file('sample_data/config.json')
preprocessor = Preprocessor.from_config(configuration.preprocessing_config)

# Columns to keep: the allocator's required household fields plus any extras
# named in the configuration.
household_fields = tuple(
    {field.name for field in allocation.DEFAULT_HOUSEHOLD_FIELDS}
    | set(configuration.household_fields)
)
households_raw = PumsData.from_csv('sample_data/households_00106_dirty.csv')
households_data = households_raw.clean(household_fields, preprocessor, puma=PUMA)

# Same selection logic for person-level records.
persons_fields = tuple(
    {field.name for field in allocation.DEFAULT_PERSON_FIELDS}
    | set(configuration.person_fields)
)
persons_raw = PumsData.from_csv('sample_data/persons_00106_dirty.csv')
persons_data = persons_raw.clean(persons_fields, preprocessor, puma=PUMA)
log.info("Loading model")


# PEP 8 (E731): use a named def rather than assigning a lambda — same callable,
# but with a real name in tracebacks and room for a docstring.
def person_segmentation(row):
    """Return the segmentation key for a person record: its age bucket."""
    return row[inputs.AGE.name]


# Train a Bayesian network over the configured person fields, weighted by the
# PUMS person weight and segmented by age.
person_training_data = SegmentedData.from_data(
    persons_data,
    list(configuration.person_fields),
    inputs.PERSON_WEIGHT.name,
    person_segmentation
)
person_model = BayesianNetworkModel.train(
    person_training_data,
    configuration.person_structure,
    configuration.person_fields
)


def household_segmenter(row):
    """Return the segmentation key for a household record: its size."""
    return row[inputs.NUM_PEOPLE.name]


# Same pattern for households: weight by household weight, segment by size.
household_training_data = SegmentedData.from_data(
    households_data,
    list(configuration.household_fields),
    inputs.HOUSEHOLD_WEIGHT.name,
    household_segmenter,
)
household_model = BayesianNetworkModel.train(
    household_training_data,
    configuration.household_structure,
    configuration.household_fields
)
def generate_population(marginals_file):
    """Synthesize a population for *marginals_file* and log summary counts.

    Reads the module-level globals households_data, persons_data,
    person_model, household_model, and log. Logs one tab-separated row:
    file name, PUMS household-weight total, control num_people total, and
    the number of generated households.
    """
    controls = Marginals.from_csv(marginals_file)
    allocator = HouseholdAllocator.from_cleaned_data(
        controls, households_data, persons_data
    )
    population = Population.generate(allocator, person_model, household_model)

    pums_total = households_data.data['household_weight'].sum()
    control_total = controls.data['num_people_count'].sum()
    generated_total = population.generated_households['household_id'].count()
    log.info("%s \t \t %d \t %d \t %d", marginals_file,
             pums_total, control_total, generated_total)
# Log a header row, then run the synthesis once per marginals file so the
# PUMS / control / generated totals can be compared across the two inputs.
log.info("File \t \t PUMS \t Controls \t Generated")
generate_population('sample_data/marginals_00106.csv')
generate_population('sample_data/marginals_00106_modified.csv')
STATEFP COUNTYFP PUMA5CE TRACTCE num_people_count num_people_1 num_people_3 num_people_2 num_people_4+ num_vehicles_1 num_vehicles_0 num_vehicles_2 num_vehicles_3+ age_0-17 age_18-34 age_65+ age_35-64
0 06 001 00106 430101 2217 305 356 648 908 270 45 1057 2076 1756 1161 671 3383
1 06 001 00106 430102 863 158 174 407 124 127 8 450 398 354 210 553 1022
2 06 001 00106 430200 2417 397 580 936 504 211 16 1380 1466 1310 911 1114 3324
3 06 001 00106 430300 1239 222 194 466 357 118 59 548 906 845 492 672 1597
4 06 001 00106 430400 752 136 150 294 172 181 6 308 661 311 331 416 997
5 06 001 00106 430500 2027 473 389 614 551 443 81 1283 873 1458 1264 582 2889
6 06 001 00106 430600 2145 496 477 699 473 425 0 1010 1421 988 1120 876 2755
7 06 001 00106 430700 1291 165 265 470 391 128 0 760 934 911 830 523 1649
8 06 001 00106 430800 2116 367 328 793 628 522 128 1335 909 1406 1377 896 2412
9 06 001 00106 430900 1822 499 356 589 378 724 40 893 467 1449 1186 587 1799
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment