Skip to content

Instantly share code, notes, and snippets.

@gustavofonseca
Created March 14, 2014 13:19
Show Gist options
  • Save gustavofonseca/9547538 to your computer and use it in GitHub Desktop.
Save gustavofonseca/9547538 to your computer and use it in GitHub Desktop.
Script utilizado na análise de preenchimento dos campos dos registros de fascículo da rede SciELO.
#!/usr/bin/env python2.7
#coding: utf-8
import tablib
def get_dataset(filepath):
    """
    Load a pipe-delimited text file into a tagged ``tablib.Dataset``.

    Every line is stripped and split on ``|``; only the first six fields are
    kept, matching the dataset headers. Lines that yield fewer than six
    fields are counted as skipped. Each kept row is tagged with the name of
    every one of its last four columns (``volume``, ``number``,
    ``suppl_volume``, ``suppl_number``) that holds a non-empty value.

    :param filepath: path of the text file to read
    :returns: tuple ``(dataset, skipped_rows, valid_rows)``
    """
    columns = ('title', 'pubdate', 'volume',
               'number', 'suppl_volume', 'suppl_number')
    first_tagged = 2  # 'title' and 'pubdate' never produce a tag
    dataset = tablib.Dataset(headers=list(columns))
    skipped = 0
    accepted = 0

    with open(filepath) as source:
        for line in source:
            fields = line.strip().split('|')[:len(columns)]

            # Guard clause: a short row cannot be mapped onto the headers.
            if len(fields) != len(columns):
                skipped += 1
                continue

            filled = [name
                      for name, value in zip(columns[first_tagged:],
                                             fields[first_tagged:])
                      if value]
            # The ``tags`` keyword is only passed when at least one tag applies.
            options = {'tags': filled} if filled else {}
            dataset.append(fields, **options)
            accepted += 1

    return dataset, skipped, accepted
def calc_column_occ(ds, total_rows, column):
    """
    Return the percentage of rows whose ``column`` holds a truthy value.

    :param ds: dataset supporting ``ds[column]`` lookup of a column's values
    :param total_rows: number of rows the percentage is relative to
    :param column: name of the column to inspect
    :returns: percentage as a float; ``0.0`` when ``total_rows`` is zero
    """
    # With no rows there is nothing to measure; report 0% instead of
    # raising ZeroDivisionError.
    if not total_rows:
        return 0.0
    filled = sum(1 for value in ds[column] if value)
    return (filled * 100) / float(total_rows)
def calc_tag_occ_AND(ds, total_rows, tag_names):
    """
    Return the percentage of rows that remain after filtering by every tag.

    The dataset is narrowed with one ``filter(tag)`` call per entry of
    ``tag_names``, each applied to the result of the previous one, and the
    truthy rows of the final result are counted.

    :param ds: dataset exposing ``filter(tag)`` and row iteration
    :param total_rows: number of rows the percentage is relative to
    :param tag_names: list of tags
    :returns: percentage as a float; ``0.0`` when ``total_rows`` is zero
    """
    # With no rows there is nothing to measure; report 0% instead of
    # raising ZeroDivisionError.
    if not total_rows:
        return 0.0
    remaining = ds
    for tag in tag_names:
        remaining = remaining.filter(tag)
    matched = sum(1 for row in remaining if row)
    return (matched * 100) / float(total_rows)
def calc_tag_occ_OR(ds, total_rows, tag_names):
    """
    Return the percentage of rows kept by a single ``filter(tag_names)`` call.

    The whole list is handed to ``ds.filter`` at once and the truthy rows of
    the result are counted.
    NOTE(review): the "OR" meaning relies on ``filter`` keeping rows that
    carry any of the listed tags -- confirm against the tablib version in use.

    :param ds: dataset exposing ``filter(tags)`` and row iteration
    :param total_rows: number of rows the percentage is relative to
    :param tag_names: list of tags
    :returns: percentage as a float; ``0.0`` when ``total_rows`` is zero
    """
    # With no rows there is nothing to measure; report 0% instead of
    # raising ZeroDivisionError.
    if not total_rows:
        return 0.0
    matched = sum(1 for row in ds.filter(tag_names) if row)
    return (matched * 100) / float(total_rows)
if __name__ == '__main__':
    # Local import: only the command-line entry point needs ``sys``.
    import sys

    # The input path may be given as the first command-line argument; it
    # falls back to the file name the script has always used.
    filepath = sys.argv[1] if len(sys.argv) > 1 else 'issues_rede_journal.txt'
    ds, skipped_rows, valid_rows = get_dataset(filepath)

    # Every print call receives one pre-formatted string, so the output text
    # is the same under the Python 2 print statement and the Python 3 print
    # function.
    separator = u'--------------------------------'

    columns = ['title', 'pubdate', 'volume',
               'number', 'suppl_volume', 'suppl_number']

    # Tag combinations to report, grouped into sections; a blank line is
    # printed before each section.
    calculators = {'AND': calc_tag_occ_AND, 'OR': calc_tag_occ_OR}
    sections = [
        [('AND', ['volume', 'number']),
         ('OR', ['volume', 'number'])],
        [('AND', ['volume', 'suppl_volume']),
         ('AND', ['volume', 'suppl_number']),
         ('AND', ['volume', 'number', 'suppl_number'])],
        [('AND', ['number', 'suppl_number']),
         ('AND', ['number', 'suppl_volume']),
         ('AND', ['number', 'volume', 'suppl_volume'])],
        [('AND', ['suppl_volume', 'suppl_number'])],
    ]

    print(separator)
    print(u'--> Total de tuplas inválidas: %s' % skipped_rows)
    print(u'--> Total de tuplas válidas: %s' % valid_rows)
    print(u'')
    for column in columns:
        occurrence = calc_column_occ(ds, valid_rows, column)
        print(u'--> Ocorrência do atributo "%s": %s %%' % (column, occurrence))

    for section in sections:
        print(u'')
        for operator, tags in section:
            label = (u' %s ' % operator).join(tags)
            occurrence = calculators[operator](ds, valid_rows, tags)
            print(u'--> Ocorrência dos atributos "%s": %s %%'
                  % (label, occurrence))
    print(separator)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment