Skip to content

Instantly share code, notes, and snippets.

@junqueira
Created August 9, 2016 02:41
Show Gist options
  • Save junqueira/86d2f7f9ef7572656acf69c3f00c2240 to your computer and use it in GitHub Desktop.
Save junqueira/86d2f7f9ef7572656acf69c3f00c2240 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
import os
import unicodedata
from collections import defaultdict, Counter
from operator import itemgetter, attrgetter
# Resolve the data files relative to the directory holding this script.
# abspath() matters: on Python versions before 3.9 a script launched as
# `python script.py` sees a relative __file__, dirname() then returns '' and
# plain `'' + '/name'` concatenation would point at the filesystem root.
local = os.path.dirname(os.path.abspath(__file__))
file_imput = os.path.join(local, 'natural_keys_scl.csv')  # input CSV path
file_out = os.path.join(local, 'natural_keys_output.csv')  # output file path
def remove_accents(input_str):
    """Return *input_str* reduced to plain ASCII, with newlines removed.

    The text is NFKD-normalised and then encoded to ASCII with the
    ``'ignore'`` error handler, so every character that has no ASCII
    encoding after normalisation is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    ascii_bytes = decomposed.encode('ASCII', 'ignore')
    return ascii_bytes.decode('utf-8').replace('\n', '')
def getDataFile(file=file_imput):
    """Load a comma-separated text file into a list of per-row dicts.

    The first line is the header: each header cell is passed through
    ``remove_accents`` and used as a dict key. Every following line becomes
    one dict mapping those keys to the cells at the same positions, with
    newline characters removed.

    Cells are separated on a bare ``','``; quoted fields that contain
    commas are not understood.

    Args:
        file: Path of the file to read.

    Returns:
        A list with one dict per data line, in file order. Empty when the
        file has no lines or only a header line.

    Raises:
        IndexError: If a data line has fewer cells than the header.
    """
    # Context manager so the handle is closed even if reading fails.
    with open(file, 'r') as handle:
        contents = handle.readlines()
    if not contents:
        return []

    # Normalise the header once instead of once per data row.
    header = [remove_accents(input_str=cell) for cell in contents[0].split(',')]

    list_data = []
    for row in contents[1:]:
        cells = row.replace('\n', '').split(',')
        # Index by position (rather than zip) so a short row still raises
        # IndexError instead of silently yielding an incomplete record.
        list_data.append({key: cells[pos] for pos, key in enumerate(header)})
    return list_data
def group_dict(lista=None, chave=''):
    """Group records by the value each one stores under *chave*.

    Args:
        lista: Iterable of dict-like records. ``None`` (the default) is
            treated as an empty collection.
        chave: Key whose value in each record selects the group.

    Returns:
        A ``defaultdict(list)`` mapping every distinct value found under
        *chave* to the list of records carrying it, in input order.

    Raises:
        KeyError: If a record has no *chave* entry.
    """
    if lista is None:
        lista = ()
    clave_natural = itemgetter(chave)
    grouped = defaultdict(list)
    # Group the records that were passed in, not any module-level state.
    for record in lista:
        grouped[clave_natural(record)].append(record)
    return grouped
def count_record(group=None, unique=True):
    """Print one report line per natural key, chosen by its record count.

    Args:
        group: Mapping of natural key -> list of records with that key.
            ``None`` (the default) or an empty container prints nothing.
        unique: When true, report only the keys that have exactly one
            record. When false, report only the keys whose record count is
            not one, together with that count.
    """
    if not group:
        return
    for key, records in group.items():
        total = len(records)
        if unique:
            # A key is unique only when exactly one record carries it.
            if total == 1:
                print('chave_natural %s unique record' % (key,))
        elif total != 1:
            print('chave_natural %s exist %s record' % (key, total))
def saveFile(lista=None, file=file_out):
    """Write the ``str()`` form of every record to *file*, overwriting it.

    Records are written back to back, with no separator or newline between
    them.

    Args:
        lista: Iterable of records to write. ``None`` (the default) writes
            nothing and leaves an empty file.
        file: Path of the output file.
    """
    records = [] if lista is None else lista
    # `with` closes the handle even when a write raises part-way through.
    with open(file, "w") as handle:
        handle.writelines(str(record) for record in records)
if __name__ == '__main__':
    # Run the report only when executed as a script, not on import.
    dataFile = getDataFile(file=file_imput)

    # Records whose 'first page' or 'last page' cell is empty. Named so the
    # built-in filter() is not shadowed.
    records_missing_pages = [
        record for record in dataFile
        if record['first page'] == '' or record['last page'] == ''
    ]

    group = group_dict(lista=dataFile, chave='chave natural')
    count_record(group=group, unique=True)
    count_record(group=group, unique=False)
    saveFile(lista=records_missing_pages, file=file_out)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment