Skip to content

Instantly share code, notes, and snippets.

@meunomemauricio
Last active January 25, 2017 14:37
Show Gist options
  • Save meunomemauricio/e28dcc135ceaf2311ff4444b46839068 to your computer and use it in GitHub Desktop.
Save meunomemauricio/e28dcc135ceaf2311ff4444b46839068 to your computer and use it in GitHub Desktop.
Scrapes the Sodexo website and exports the account transaction history as a CSV file.

Sodexo Exporter

Sodexo is a company that provides meal allowances for other companies' employees here in Brazil.

This tool scrapes the Sodexo website and provides the account transaction history in a CSV file.

The captcha is not bypassed. The image is displayed through ImageMagick and the user has to type the text.

Dependencies

It's necessary to install Beautiful Soup 4.x, Requests, Pillow and lxml (used as the Beautiful Soup parser) through pip.

To display the captcha images, install ImageMagick:

sudo apt-get install imagemagick
#! /usr/bin/python
# -*- coding: utf-8 -*-
import argparse
import csv
import codecs
import cStringIO
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from PIL import Image
from StringIO import StringIO
# Endpoints of the Sodexo card-balance web application. The set-up and
# consult operations are the same page selected by an `operation` query
# parameter.
SODEXO_URL = 'https://sodexosaldocartao.com.br/saldocartao/consultaSaldo.do'
SETUP_URL = SODEXO_URL + '?operation=setUp'
POST_URL = SODEXO_URL + '?operation=consult'
CAPTCHA_URL = 'https://sodexosaldocartao.com.br/saldocartao/jcaptcha.do'

# Header of the exported CSV. Kept as a one-element list of rows so it can be
# concatenated directly with the list of data rows.
CSV_HEADER = [
    ('Date', 'Payee', 'Category', 'Memo', 'Outflow', 'Inflow'),
]

# Fixed values written to the Payee and Category columns of every data row.
PAYEE = 'Restaurante'
CATEGORY = 'Sodexo'
class AuthenticationError(Exception):
    """Raised when the site rejects the submitted data (e.g. wrong credentials).

    The exception message is the text the site returned.
    """
class UnicodeCSVWriter(object):
"""A CSV writer with different encoding."""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
self.writer.writerow([s.encode("utf-8") for s in row])
data = self.queue.getvalue()
data = data.decode("utf-8")
data = self.encoder.encode(data)
self.stream.write(data)
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
class SodexoScraper(object):
    """Scrape the Sodexo card-balance site for a card's transaction history.

    A single `requests.Session` is used for every request (set-up page,
    captcha image and query), presumably so that the captcha answer is
    checked against the image served to this same session.
    """

    def __init__(self, card_number, cpf):
        """Store the card data and create the HTTP session.

        :param card_number: Sodexo card number, sent as `cardNumber`.
        :param cpf: CPF of the card holder, sent as `cpf`.
        """
        self.session = requests.Session()
        self.card_number = card_number
        self.cpf = cpf

    @staticmethod
    def _translate_entry(date, value, xtype, auth, memo):
        """Convert the five cells of a table row into one CSV data row.

        The returned tuple follows the column order of `CSV_HEADER`:
        date, payee, category, memo, outflow, inflow.
        """
        # `auth` is part of the scraped row but has no column in the export.
        # Types starting with 'C' are treated as money coming in (presumably
        # credits -- TODO confirm against the site); anything else is spending.
        is_inflow = xtype.startswith('C')
        inflow = value if is_inflow else ''
        outflow = '' if is_inflow else value
        return date, PAYEE, CATEGORY, memo, outflow, inflow

    def _parse_html(self, html):
        """Parse the HTML table and return content as a Python object.

        :param html: body of the response to the card query.
        :returns: list of CSV data rows (tuples), one per table row that
            contains `td` cells.
        :raises AuthenticationError: if the page contains a `msgRetorno`
            element; its text is used as the error message.
        :raises ValueError: if the page contains no `gridSaldo` table.
        """
        soup = BeautifulSoup(html, 'lxml', from_encoding='utf-8')
        print('Encoding: {0}'.format(soup.original_encoding))

        msg_return = soup.find(id='msgRetorno')
        if msg_return is not None:
            raise AuthenticationError(msg_return.text.strip())

        balance_table = soup.find(id='gridSaldo')
        if balance_table is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(
                'Transaction table "gridSaldo" not found in the response.')

        rows = (
            [cell.text for cell in row.findAll('td')]
            for row in balance_table.findAll('tr')
        )
        # Rows without any `td` cell (e.g. header rows) come out empty and
        # are skipped.
        return [self._translate_entry(*cells) for cells in rows if cells]

    def _prompt_captcha(self):
        """Download the captcha image, show it, and return what the user types."""
        r = self.session.get(CAPTCHA_URL)
        captcha_image = Image.open(StringIO(r.content))
        captcha_image.show()
        return raw_input('CAPTCHA: ')

    def _post_card(self, captcha_text):
        """Post the card information and return the raw response body.

        :param captcha_text: captcha answer typed by the user.
        """
        post_data = {
            'service': '5;1;6',
            'cardNumber': self.card_number,
            'cpf': self.cpf,
            'hiddenField': captcha_text,
        }
        # NOTE(review): the fields are sent in the URL query string
        # (`params=`), not in the request body (`data=`). Kept as-is because
        # the server's expectations cannot be verified from here.
        r = self.session.post(POST_URL, params=post_data)
        return r.content

    def get_transaction_history(self):
        """Scrape the website and return the transaction history rows.

        :raises AuthenticationError: if the site answers with an error message.
        """
        # Setup Session before doing the rest
        self.session.get(SETUP_URL)
        captcha_resp = self._prompt_captcha()
        response = self._post_card(captcha_resp)
        return self._parse_html(response)

    def save_csv_data(self, filename):
        """Export the transaction history as a CSV file.

        :param filename: path of the CSV file to create or overwrite.
        """
        data = self.get_transaction_history()
        # The Python 2 csv module writes its own '\r\n' line terminators, so
        # the file must be opened in binary mode; text mode would turn them
        # into '\r\r\n' on Windows.
        with open(filename, 'wb') as fd:
            csvwriter = UnicodeCSVWriter(fd, delimiter=',', quotechar='"')
            csvwriter.writerows(CSV_HEADER + data)
def parse_arguments():
    """Parse the command line and return the resulting namespace.

    Two positional arguments are expected: `card_number` and `cpf`.
    """
    positionals = (
        ('card_number', 'Sodexo Card Number.'),
        ('cpf', 'User CPF.'),
    )
    parser = argparse.ArgumentParser()
    for arg_name, arg_help in positionals:
        parser.add_argument(arg_name, help=arg_help)
    return parser.parse_args()
def main():
    """Export the history of the card given on the command line to a CSV file."""
    cli = parse_arguments()
    scraper = SodexoScraper(cli.card_number, cli.cpf)
    # The output file name carries the current date,
    # e.g. sodexo_saldo_20170125.csv.
    stamp = datetime.now().strftime('%Y%m%d')
    scraper.save_csv_data('sodexo_saldo_' + stamp + '.csv')
# Run the exporter only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment