Created
August 9, 2016 02:41
-
-
Save junqueira/86d2f7f9ef7572656acf69c3f00c2240 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import os | |
import unicodedata | |
from collections import defaultdict, Counter | |
from operator import itemgetter, attrgetter | |
# Directory containing this script. Made absolute first so the paths below
# stay valid when __file__ is a bare relative file name: dirname() would then
# return '' and a plain '/name.csv' suffix would point at the filesystem root.
local = os.path.dirname(os.path.abspath(__file__))
# CSV file that getDataFile() reads by default.
file_imput = os.path.join(local, 'natural_keys_scl.csv')
# File that saveFile() writes by default.
file_out = os.path.join(local, 'natural_keys_output.csv')
def remove_accents(input_str):
    """Return *input_str* reduced to plain ASCII, with newlines removed.

    The text is NFKD-normalised, which splits accented letters into a base
    letter plus combining marks. Encoding to ASCII with ``'ignore'`` then
    drops those marks together with any other non-ASCII character.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    ascii_bytes = decomposed.encode('ASCII', 'ignore')
    return ascii_bytes.decode('utf-8').replace('\n', '')
def getDataFile(file=file_imput):
    """Load a comma-separated text file as a list of dicts, one per data row.

    The first line supplies the column names; each name is passed through
    ``remove_accents`` before being used as a dict key. Every following line
    becomes one record mapping column name to that row's cell text with
    newline characters removed. Cells come from a plain ``split(',')``, so
    quoted fields that contain commas are not supported.

    Args:
        file: Path of the file to read.

    Returns:
        A list of dicts in file order; empty when the file has no data rows.

    Raises:
        IndexError: If a data row has fewer cells than the header line.
    """
    # The context manager closes the handle even if reading fails.
    with open(file, 'r') as handle:
        contents = handle.readlines()

    if not contents:
        return []

    # Normalise the header once instead of once per data row.
    headers = [remove_accents(input_str=cell) for cell in contents[0].split(',')]

    list_data = []
    for row in contents[1:]:
        cells = row.replace('\n', '').split(',')
        # Index by position (not zip) so a short row still raises IndexError
        # instead of silently producing a record with missing columns.
        list_data.append({name: cells[pos] for pos, name in enumerate(headers)})
    return list_data
def group_dict(lista=None, chave=''):
    """Group dict records by the value each one stores under *chave*.

    Args:
        lista: Iterable of dict records to group; ``None`` is treated as an
            empty collection.
        chave: Key looked up in every record to obtain its group value.

    Returns:
        A ``defaultdict(list)`` mapping each distinct value to the list of
        records that carry it, in the order they were encountered.

    Raises:
        KeyError: If a record has no *chave* entry.
    """
    clave_natural = itemgetter(chave)
    d = defaultdict(list)
    # Group the records that were passed in, not any module-level state.
    for record in (lista if lista is not None else ()):
        d[clave_natural(record)].append(record)
    return d
def count_record(group=None, unique=True):
    """Print one line per key, selected by how many records the key holds.

    Args:
        group: Mapping from key to the list of records stored under it;
            ``None`` or an empty mapping prints nothing.
        unique: When true, report only keys that hold exactly one record.
            When false, report only keys whose record count is not one,
            together with that count.
    """
    if not group:
        return
    for key, records in group.items():
        total = len(records)
        if unique:
            # Only keys with a single record are actually unique.
            if total == 1:
                print('chave_natural %s unique record' % (key,))
        elif total != 1:
            print('chave_natural %s exist %s record' % (key, total))
def saveFile(lista=None, file=file_out):
    """Write the ``str()`` form of every record to *file*, replacing its contents.

    Records are written back to back, with no separator or newline added
    between them.

    Args:
        lista: Iterable of records to write; ``None`` writes an empty file.
        file: Path of the file to create or overwrite.
    """
    # The context manager closes the handle even if a write raises.
    with open(file, "w") as handle:
        handle.writelines(str(record) for record in (lista if lista is not None else ()))
# Script body: load the records, select those missing a page value, print
# the per-key record reports, then save the selected records.
dataFile = getDataFile(file=file_imput)

# Keep every record whose 'first page' or 'last page' cell is empty.
# NOTE(review): the name `filter` shadows the builtin from here on.
filter = [
    record
    for record in dataFile
    if not (record['first page'] != '' and record['last page'] != '')
]

group = group_dict(lista=dataFile, chave='chave natural')

# First the unique=True report, then the unique=False report.
count_record(group=group, unique=True)
count_record(group=group, unique=False)

saveFile(lista=filter, file=file_out)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment