denilsonsa/moneylog_convert.py

## moneylog_convert.py
#!/usr/bin/env python2
# -*- coding: utf8 -*-

from __future__ import unicode_literals

import sys
from datetime import date, timedelta
import re


# Some useful variables
# Date assumed for the entries until the input provides an ISO date line.
current_date = date(2009, 7, 1)


# Let's suppose the input and the output are both in UTF-8, without Byte-Order-Mark
# Might be useful to run this at the input file:
#   :set fenc=utf8
#   :set nobomb
#   :set ff=unix
input_file = sys.stdin
output_file = sys.stdout

# Characters stripped from both ends of a line. Spelled out so that only ASCII
# whitespace is removed, regardless of the string type being stripped.
_ASCII_WHITESPACE = ' \t\n\r\x0b\x0c'

# re.UNICODE goes together with re.IGNORECASE everywhere: under Python 2 the
# accented letters used below are otherwise neither word characters (which
# breaks \b next to them) nor case-folded.
_FLAGS = re.I | re.U


# Some regular expressions
re_isodate = re.compile(r'^(?P<year>\d\d\d\d)-(?P<month>\d\d)-(?P<day>\d\d)$')
re_dia = re.compile(r'^(?P<day>segunda|ter[çc]a|quarta|quinta|sexta|s[áa]bado|domingo) *(?P<date>\d+) *$', _FLAGS)
re_custo = re.compile(r'^(?P<sinal>[-+]?)(?P<valor>[0-9.,]+) *(?P<nome>.*)$')
re_pagamento = re.compile(r'^(?P<forma>din|d[eé]b|cr[eé]d?) (?P<resto>.*)$', _FLAGS)
forma_pagamento_tags = [
    (re.compile(r'^din$'     , _FLAGS), ['dinheiro']),
    (re.compile(r'^d[eé]b$'  , _FLAGS), ['debito']),
    (re.compile(r'^cr[eé]d?$', _FLAGS), ['nubank']),
]

# Not referenced anywhere else in this script.
re_ricos_lanches = re.compile(r'(no )?rico.?s( lanches?)?', re.I)

# Spelling/capitalization fixes applied, in order, to each entry description.
_NAME_FIXES = [
    (re.compile(r'\bsubway\b', _FLAGS), 'Subway'),
    (re.compile(r'\bBurgu?er King\b', _FLAGS), 'Burger King'),
    (re.compile(r'\bcarrefour\b', _FLAGS), 'Carrefour'),
    # "lojas" is optional *together with* its separating space, so both
    # "nas americanas" and "nas lojas americanas" are normalized.
    (re.compile(r'\bnas +(?:lojas +)?americanas\b', _FLAGS), 'nas Lojas Americanas'),
    (re.compile(r'\blojas americanas\b', _FLAGS), 'Lojas Americanas'),
    (re.compile(r"\bhabib'?s\b", _FLAGS), "Habib's"),
    (re.compile(r"\bbob'?s\b", _FLAGS), "Bob's"),
    (re.compile(r'\bsanduiche\b', _FLAGS), 'sanduíche'),
    (re.compile(r'\bt[aá]xi\b', _FLAGS), 'táxi'),
    (re.compile(r'\b[oô]nibus\b', _FLAGS), 'ônibus'),
]

# Descriptions starting with one of these prefixes get the 'onibus' tag.
_BUS_LINE_PREFIXES = ('SC01', 'SC02', 'SC03', 'SC04')

# (predicate, tag) pairs evaluated in order against the normalized description.
_TAG_RULES = [
    (re.compile(r'\bTáxi\b', _FLAGS).search, 'taxi'),
    (lambda nome: nome.upper().startswith(_BUS_LINE_PREFIXES), 'onibus'),
    (re.compile(r'\bÔnibus\b', _FLAGS).search, 'onibus'),
    (re.compile(r"\b(pizza|subway|sanduíches?|cachorro quente|hot dog|hambúrguer|calabresa|salgado|milkshake|almoço|jantar|lanches?|habib's|bob's|pão|pães|padaria|banana|biscoito|leite|comida|prato feito|restaurante|sorvete|sorveteria|suco)\b", _FLAGS).search, 'alimentacao'),
    (re.compile(r'\brecarga\b.*\bcelular\b', _FLAGS).search, 'telefone'),
    (re.compile(r'\bpassagem\b.*\bviação\b', _FLAGS).search, 'viagem'),
    (re.compile(r'\btáxi\b.*\b(rodoviária|aeroporto)\b', _FLAGS).search, 'viagem'),
    (re.compile(r'\b(gog(\.com)?|steam|humble ?bundle|google play)\b', _FLAGS).search, 'compras'),
    (re.compile(r'\b(estacionamento)\b', _FLAGS).search, 'outros'),
]


def _add_tags(tags, new_tags):
    """Append to `tags` each item of `new_tags` not already present, keeping order."""
    for tag in new_tags:
        if tag not in tags:
            tags.append(tag)


def _next_date(today, new_day):
    """Return the date for day-of-month `new_day`, given the previous date `today`.

    A day number smaller than the previous one means the month (and, after
    December, the year) has rolled over. Raises ValueError (from `date`) if
    `new_day` does not exist in the resulting month.
    """
    if new_day >= today.day:
        return today.replace(day=new_day)
    if today.month == 12:
        return date(today.year + 1, 1, new_day)
    return date(today.year, today.month + 1, new_day)


def _classify(nome):
    """Normalize an entry description and derive its tags.

    Returns a `(nome, tags)` tuple: the description without the leading
    payment-method word, with the spelling fixes applied and the first letter
    upper-cased; and the list of tags, without duplicates, in rule order.
    """
    tags = []

    forma_pagamento = re_pagamento.search(nome)
    if forma_pagamento:
        forma = forma_pagamento.group('forma')
        nome = forma_pagamento.group('resto')
        for forma_re, forma_tags in forma_pagamento_tags:
            if forma_re.match(forma):
                _add_tags(tags, forma_tags)

    for pattern, replacement in _NAME_FIXES:
        nome = pattern.sub(replacement, nome)

    nome = nome[0:1].upper() + nome[1:]

    for applies, tag in _TAG_RULES:
        if applies(nome):
            _add_tags(tags, [tag])

    return nome, tags


def convert_line(line, today):
    """Convert one line of text from the input into moneylog output.

    `line` is a text (already decoded) line and `today` is the date that
    applies to it. Returns a `(output, today)` tuple, where `output` is the
    text to be written (an empty string for lines that only set the date) and
    `today` is the date that applies to the following lines.

    Empty and non-recognized lines are returned unchanged.
    """
    stripped = line.strip(_ASCII_WHITESPACE)

    # Empty line
    if stripped == '':
        return line, today

    # Line with just an ISO date
    match = re_isodate.search(stripped)
    if match:
        return '', date(
            int(match.group('year'), 10),
            int(match.group('month'), 10),
            int(match.group('day'), 10)
        )

    # New day
    match = re_dia.search(stripped)
    if match:
        return '', _next_date(today, int(match.group('date'), 10))

    # Some value. The regex also accepts "values" made only of dots and
    # commas (e.g. a line starting with "..."); those are not amounts, so
    # they fall through as non-recognized lines.
    match = re_custo.search(stripped)
    if match and any(char.isdigit() for char in match.group('valor')):
        sinal = match.group('sinal') or '-'
        valor = match.group('valor').replace(',', '.')
        nome, tags = _classify(match.group('nome').strip(_ASCII_WHITESPACE))

        # Finally, the almighty output!
        entry = '%s\t%s%7s\t%s| %s\n' % (
            today.isoformat(), sinal, valor, ','.join(tags), nome
        )
        return entry, today

    # Something else, any non-recognized line
    return line, today


def main():
    """Filter `input_file` into `output_file`, one line at a time."""
    output_file.write('# vi:filetype=moneylog\n')

    today = current_date
    for line in input_file:
        # Decode byte lines on the way in and encode them back on the way
        # out, so all the matching in between is done on text.
        is_bytes = isinstance(line, bytes)
        text = line.decode('utf8') if is_bytes else line

        output, today = convert_line(text, today)
        if output:
            output_file.write(output.encode('utf8') if is_bytes else output)


if __name__ == '__main__':
    main()

## sample_input.txt
2016-06-13
Terça 14
+50.00 mesada
Quarta 15
10.00 sanduiche no subway
35.00 taxi
	#!/usr/bin/env python2
	# -*- coding: utf8 -*-

	from __future__ import unicode_literals

	import sys
	from datetime import date, timedelta
	import re


	# Some useful variables
# Date assumed for the entries until the input provides an ISO date line.
current_date = date(2009, 7, 1)


# Let's suppose the input and the output are both in UTF-8, without Byte-Order-Mark
# Might be useful to run this at the input file:
#   :set fenc=utf8
#   :set nobomb
#   :set ff=unix
input_file = sys.stdin
output_file = sys.stdout

# Characters stripped from both ends of a line. Spelled out so that only ASCII
# whitespace is removed, regardless of the string type being stripped.
_ASCII_WHITESPACE = ' \t\n\r\x0b\x0c'

# re.UNICODE goes together with re.IGNORECASE everywhere: under Python 2 the
# accented letters used below are otherwise neither word characters (which
# breaks \b next to them) nor case-folded.
_FLAGS = re.I | re.U


# Some regular expressions
re_isodate = re.compile(r'^(?P<year>\d\d\d\d)-(?P<month>\d\d)-(?P<day>\d\d)$')
re_dia = re.compile(r'^(?P<day>segunda|ter[çc]a|quarta|quinta|sexta|s[áa]bado|domingo) *(?P<date>\d+) *$', _FLAGS)
re_custo = re.compile(r'^(?P<sinal>[-+]?)(?P<valor>[0-9.,]+) *(?P<nome>.*)$')
re_pagamento = re.compile(r'^(?P<forma>din|d[eé]b|cr[eé]d?) (?P<resto>.*)$', _FLAGS)
forma_pagamento_tags = [
    (re.compile(r'^din$'     , _FLAGS), ['dinheiro']),
    (re.compile(r'^d[eé]b$'  , _FLAGS), ['debito']),
    (re.compile(r'^cr[eé]d?$', _FLAGS), ['nubank']),
]

# Not referenced anywhere else in this script.
re_ricos_lanches = re.compile(r'(no )?rico.?s( lanches?)?', re.I)

# Spelling/capitalization fixes applied, in order, to each entry description.
_NAME_FIXES = [
    (re.compile(r'\bsubway\b', _FLAGS), 'Subway'),
    (re.compile(r'\bBurgu?er King\b', _FLAGS), 'Burger King'),
    (re.compile(r'\bcarrefour\b', _FLAGS), 'Carrefour'),
    # "lojas" is optional *together with* its separating space, so both
    # "nas americanas" and "nas lojas americanas" are normalized.
    (re.compile(r'\bnas +(?:lojas +)?americanas\b', _FLAGS), 'nas Lojas Americanas'),
    (re.compile(r'\blojas americanas\b', _FLAGS), 'Lojas Americanas'),
    (re.compile(r"\bhabib'?s\b", _FLAGS), "Habib's"),
    (re.compile(r"\bbob'?s\b", _FLAGS), "Bob's"),
    (re.compile(r'\bsanduiche\b', _FLAGS), 'sanduíche'),
    (re.compile(r'\bt[aá]xi\b', _FLAGS), 'táxi'),
    (re.compile(r'\b[oô]nibus\b', _FLAGS), 'ônibus'),
]

# Descriptions starting with one of these prefixes get the 'onibus' tag.
_BUS_LINE_PREFIXES = ('SC01', 'SC02', 'SC03', 'SC04')

# (predicate, tag) pairs evaluated in order against the normalized description.
_TAG_RULES = [
    (re.compile(r'\bTáxi\b', _FLAGS).search, 'taxi'),
    (lambda nome: nome.upper().startswith(_BUS_LINE_PREFIXES), 'onibus'),
    (re.compile(r'\bÔnibus\b', _FLAGS).search, 'onibus'),
    (re.compile(r"\b(pizza|subway|sanduíches?|cachorro quente|hot dog|hambúrguer|calabresa|salgado|milkshake|almoço|jantar|lanches?|habib's|bob's|pão|pães|padaria|banana|biscoito|leite|comida|prato feito|restaurante|sorvete|sorveteria|suco)\b", _FLAGS).search, 'alimentacao'),
    (re.compile(r'\brecarga\b.*\bcelular\b', _FLAGS).search, 'telefone'),
    (re.compile(r'\bpassagem\b.*\bviação\b', _FLAGS).search, 'viagem'),
    (re.compile(r'\btáxi\b.*\b(rodoviária|aeroporto)\b', _FLAGS).search, 'viagem'),
    (re.compile(r'\b(gog(\.com)?|steam|humble ?bundle|google play)\b', _FLAGS).search, 'compras'),
    (re.compile(r'\b(estacionamento)\b', _FLAGS).search, 'outros'),
]


def _add_tags(tags, new_tags):
    """Append to `tags` each item of `new_tags` not already present, keeping order."""
    for tag in new_tags:
        if tag not in tags:
            tags.append(tag)


def _next_date(today, new_day):
    """Return the date for day-of-month `new_day`, given the previous date `today`.

    A day number smaller than the previous one means the month (and, after
    December, the year) has rolled over. Raises ValueError (from `date`) if
    `new_day` does not exist in the resulting month.
    """
    if new_day >= today.day:
        return today.replace(day=new_day)
    if today.month == 12:
        return date(today.year + 1, 1, new_day)
    return date(today.year, today.month + 1, new_day)


def _classify(nome):
    """Normalize an entry description and derive its tags.

    Returns a `(nome, tags)` tuple: the description without the leading
    payment-method word, with the spelling fixes applied and the first letter
    upper-cased; and the list of tags, without duplicates, in rule order.
    """
    tags = []

    forma_pagamento = re_pagamento.search(nome)
    if forma_pagamento:
        forma = forma_pagamento.group('forma')
        nome = forma_pagamento.group('resto')
        for forma_re, forma_tags in forma_pagamento_tags:
            if forma_re.match(forma):
                _add_tags(tags, forma_tags)

    for pattern, replacement in _NAME_FIXES:
        nome = pattern.sub(replacement, nome)

    nome = nome[0:1].upper() + nome[1:]

    for applies, tag in _TAG_RULES:
        if applies(nome):
            _add_tags(tags, [tag])

    return nome, tags


def convert_line(line, today):
    """Convert one line of text from the input into moneylog output.

    `line` is a text (already decoded) line and `today` is the date that
    applies to it. Returns a `(output, today)` tuple, where `output` is the
    text to be written (an empty string for lines that only set the date) and
    `today` is the date that applies to the following lines.

    Empty and non-recognized lines are returned unchanged.
    """
    stripped = line.strip(_ASCII_WHITESPACE)

    # Empty line
    if stripped == '':
        return line, today

    # Line with just an ISO date
    match = re_isodate.search(stripped)
    if match:
        return '', date(
            int(match.group('year'), 10),
            int(match.group('month'), 10),
            int(match.group('day'), 10)
        )

    # New day
    match = re_dia.search(stripped)
    if match:
        return '', _next_date(today, int(match.group('date'), 10))

    # Some value. The regex also accepts "values" made only of dots and
    # commas (e.g. a line starting with "..."); those are not amounts, so
    # they fall through as non-recognized lines.
    match = re_custo.search(stripped)
    if match and any(char.isdigit() for char in match.group('valor')):
        sinal = match.group('sinal') or '-'
        valor = match.group('valor').replace(',', '.')
        nome, tags = _classify(match.group('nome').strip(_ASCII_WHITESPACE))

        # Finally, the almighty output!
        entry = '%s\t%s%7s\t%s| %s\n' % (
            today.isoformat(), sinal, valor, ','.join(tags), nome
        )
        return entry, today

    # Something else, any non-recognized line
    return line, today


def main():
    """Filter `input_file` into `output_file`, one line at a time."""
    output_file.write('# vi:filetype=moneylog\n')

    today = current_date
    for line in input_file:
        # Decode byte lines on the way in and encode them back on the way
        # out, so all the matching in between is done on text.
        is_bytes = isinstance(line, bytes)
        text = line.decode('utf8') if is_bytes else line

        output, today = convert_line(text, today)
        if output:
            output_file.write(output.encode('utf8') if is_bytes else output)


if __name__ == '__main__':
    main()
	2016-06-13
	Terça 14
	+50.00 mesada
	Quarta 15
	10.00 sanduiche no subway
	35.00 taxi