Created
March 14, 2014 13:19
-
-
Save gustavofonseca/9547538 to your computer and use it in GitHub Desktop.
Script utilizado na análise de preenchimento dos campos dos registros de fascículo da rede SciELO.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2.7 | |
#coding: utf-8 | |
import tablib | |
def get_dataset(filepath):
    """
    Load pipe-delimited records from ``filepath`` into a tablib dataset.

    Only the first six ``|``-separated fields of each line are used. Lines
    that yield fewer than six fields are counted as skipped and left out.
    Every accepted row is tagged with the name of each non-empty field among
    ``volume``, ``number``, ``suppl_volume`` and ``suppl_number``.

    :param filepath: path of the text file to read
    :returns: tuple ``(dataset, skipped_rows, valid_rows)``
    """
    field_names = ['title', 'pubdate', 'volume',
                   'number', 'suppl_volume', 'suppl_number']
    # Title and pubdate are never used as tags; only the remaining columns are.
    taggable = field_names[2:]

    dataset = tablib.Dataset(headers=field_names)
    skipped = 0
    accepted = 0

    with open(filepath) as source:
        for line in source:
            fields = line.strip().split('|')[:len(field_names)]
            if len(fields) != len(field_names):
                skipped += 1
                continue

            present = [name
                       for name, value in zip(taggable, fields[2:])
                       if value]
            # The ``tags`` keyword is passed only when at least one field
            # is filled in.
            options = {'tags': present} if present else {}
            dataset.append(fields, **options)
            accepted += 1

    return dataset, skipped, accepted
def calc_column_occ(ds, total_rows, column):
    """
    Return the percentage of rows whose ``column`` value is non-empty.

    :param ds: dataset indexable by column name; ``ds[column]`` must yield
               the values stored in that column
    :param total_rows: number of rows used as the denominator
    :param column: name of the column to inspect
    :returns: float percentage, or ``0.0`` when ``total_rows`` is zero
    """
    if not total_rows:
        # No rows to measure against: report 0% instead of dividing by zero.
        return 0.0
    filled = sum(1 for value in ds[column] if value)
    # float() forces true division under Python 2.
    return (filled * 100) / float(total_rows)
def calc_tag_occ_AND(ds, total_rows, tag_names):
    """
    Return the percentage of rows that carry every tag in ``tag_names``.

    :param ds: dataset exposing ``filter(tag)``; each call is expected to
               return a dataset narrowed to the rows carrying ``tag``
    :param total_rows: number of rows used as the denominator
    :param tag_names: list of tags
    :returns: float percentage, or ``0.0`` when ``total_rows`` is zero
    """
    if not total_rows:
        # No rows to measure against: report 0% instead of dividing by zero.
        return 0.0
    matching = ds
    for tag in tag_names:
        # Filtering the previous result again narrows it one tag at a time,
        # which is what gives the AND semantics.
        matching = matching.filter(tag)
    total_matching = sum(1 for row in matching if row)
    # float() forces true division under Python 2.
    return (total_matching * 100) / float(total_rows)
def calc_tag_occ_OR(ds, total_rows, tag_names):
    """
    Return the percentage of rows that carry at least one of ``tag_names``.

    :param ds: dataset exposing ``filter(tags)``; when given a list it is
               expected to keep the rows carrying any of the listed tags
               (tablib behaviour -- confirm against the installed version)
    :param total_rows: number of rows used as the denominator
    :param tag_names: list of tags
    :returns: float percentage, or ``0.0`` when ``total_rows`` is zero
    """
    if not total_rows:
        # No rows to measure against: report 0% instead of dividing by zero.
        return 0.0
    # A single filter call with the whole list provides the OR semantics.
    matching = ds.filter(tag_names)
    total_matching = sum(1 for row in matching if row)
    # float() forces true division under Python 2.
    return (total_matching * 100) / float(total_rows)
if __name__ == '__main__': | |
ds, skipped_rows, valid_rows = get_dataset('issues_rede_journal.txt') | |
print u'--------------------------------' | |
print u'--> Total de tuplas inválidas:', skipped_rows | |
print u'--> Total de tuplas válidas:', valid_rows | |
print u'' | |
print u'--> Ocorrência do atributo "title":', calc_column_occ(ds, valid_rows, 'title'), '%' | |
print u'--> Ocorrência do atributo "pubdate":', calc_column_occ(ds, valid_rows, 'pubdate'), '%' | |
print u'--> Ocorrência do atributo "volume":', calc_column_occ(ds, valid_rows, 'volume'), '%' | |
print u'--> Ocorrência do atributo "number":', calc_column_occ(ds, valid_rows, 'number'), '%' | |
print u'--> Ocorrência do atributo "suppl_volume":', calc_column_occ(ds, valid_rows, 'suppl_volume'), '%' | |
print u'--> Ocorrência do atributo "suppl_number":', calc_column_occ(ds, valid_rows, 'suppl_number'), '%' | |
print u'' | |
print u'--> Ocorrência dos atributos "volume AND number":', calc_tag_occ_AND(ds, valid_rows, ['volume', 'number']), '%' | |
print u'--> Ocorrência dos atributos "volume OR number":', calc_tag_occ_OR(ds, valid_rows, ['volume', 'number']), '%' | |
print u'' | |
print u'--> Ocorrência dos atributos "volume AND suppl_volume":', calc_tag_occ_AND(ds, valid_rows, ['volume', 'suppl_volume']), '%' | |
print u'--> Ocorrência dos atributos "volume AND suppl_number":', calc_tag_occ_AND(ds, valid_rows, ['volume', 'suppl_number']), '%' | |
print u'--> Ocorrência dos atributos "volume AND number AND suppl_number":', calc_tag_occ_AND(ds, valid_rows, ['volume', 'number', 'suppl_number']), '%' | |
print u'' | |
print u'--> Ocorrência dos atributos "number AND suppl_number":', calc_tag_occ_AND(ds, valid_rows, ['number', 'suppl_number']), '%' | |
print u'--> Ocorrência dos atributos "number AND suppl_volume":', calc_tag_occ_AND(ds, valid_rows, ['number', 'suppl_volume']), '%' | |
print u'--> Ocorrência dos atributos "number AND volume AND suppl_volume":', calc_tag_occ_AND(ds, valid_rows, ['number', 'volume', 'suppl_volume']), '%' | |
print u'' | |
print u'--> Ocorrência dos atributos "suppl_volume AND suppl_number":', calc_tag_occ_AND(ds, valid_rows, ['suppl_volume', 'suppl_number']), '%' | |
print u'--------------------------------' | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment