Skip to content

Instantly share code, notes, and snippets.

@meunomemauricio
Last active January 25, 2017 14:23
Show Gist options
  • Save meunomemauricio/743522121e1040759e0ec196c744a16a to your computer and use it in GitHub Desktop.
A small web scraper to track packages from the Correios - SRO (Sistema de Rastreamento de Objetos) Website.
#! /usr/bin/python3
"""Scrape the correios website for package tracking."""
import argparse
import http.client
import logging
import re
import requests
import sys
from bs4 import BeautifulSoup
URL = 'http://websro.correios.com.br/sro_bin/txect01$.QueryList'
CODE_PATTERN = re.compile(r'\w{2}\d{9}\w{2}')
class Package:
    """A tracked package: its tracking code and most recent status line."""

    def __init__(self, code, last_status):
        self.code = code                # tracking code, e.g. "SS987654321XX"
        self.last_status = last_status  # latest status string scraped from the site

    def __repr__(self):
        return '{}({!r}, {!r})'.format(
            type(self).__name__, self.code, self.last_status)

    def __str__(self):
        return '{}: {}'.format(self.code, self.last_status)
def parse_arguments():
    """Parse command line arguments.

    Returns the parsed namespace with:
      code  -- tracking code(s); several codes may be separated by ";"
      debug -- True when HTTP request debugging was requested
    """
    parser = argparse.ArgumentParser(description=__doc__)
    # Fixed typo: "Cans specify" -> "Can specify".
    arg_help = 'Tracking Code. Can specify more than one separated by ";"'
    parser.add_argument('code', help=arg_help)
    parser.add_argument('-d', '--debug', action='store_true', default=False,
                        help='Debug HTTP requests')
    return parser.parse_args()
def enable_debug():
    """Turn on verbose logging of HTTP requests and responses."""
    # http.client dumps request/response lines when debuglevel > 0.
    http.client.HTTPConnection.debuglevel = 1
    # Logging must be configured, otherwise the debug records are dropped.
    logging.basicConfig()
    logging.getLogger().setLevel(logging.DEBUG)
    urllib3_log = logging.getLogger("requests.packages.urllib3")
    urllib3_log.setLevel(logging.DEBUG)
    urllib3_log.propagate = True
def code_format_validation(code):
    """Validate tracking code format; exit with status 1 on bad input.

    Accepts one or more codes separated by ";". Each code must match
    CODE_PATTERN entirely (two word chars, nine digits, two word chars).
    """
    for single_code in code.split(';'):
        # fullmatch so trailing garbage is rejected too -- match() only
        # anchors the start and would accept e.g. "SS987654321XXjunk".
        if not CODE_PATTERN.fullmatch(single_code):
            print('Code must be in the format "SS987654321XX"')
            sys.exit(1)
def request_page(code):
    """Request the package information page and return its contents."""
    single = len(code.split(';')) == 1
    # The site expects different form fields and type codes depending on
    # whether a single code or a ";"-separated list is being queried.
    payload = {
        ('P_COD_UNI' if single else 'P_COD_LIS'): code,
        'P_LINGUA': '001',
        'Z_ACTION': 'Search',
        'P_TIPO': '001' if single else '003',
    }
    response = requests.post(URL, data=payload)
    return response.text
def parse_table(page):
    """Parse the html page and return a list of Packages.
    Only the last status of the package is considered.
    """
    soup = BeautifulSoup(page, 'html.parser')
    table_type = _get_table_type(soup)
    # Dispatch on the table type reported by the page itself.
    parser = {
        '001': _parse_single_table,
        '003': _parse_list_table,
    }.get(table_type)
    if parser is None:
        raise RuntimeError('Invalid Table Type: {}'.format(table_type))
    return parser(soup)
def _get_table_type(soup):
"""Get the value of the INPUT tag with name equals P_TIPO"""
return soup.find(attrs={'name': 'P_TIPO'})['value']
def _parse_single_table(soup):
    """Parse the result table when consulting a single package.

    Returns a one-element list with a Package built from the code echoed
    back in the P_ITEMCODE input and the first FONT tag without a "face"
    attribute found in the table.
    """
    code = soup.find(attrs={'name': 'P_ITEMCODE'})['value']
    # Initialize so a page with no matching FONT tag yields a Package with
    # last_status=None instead of raising UnboundLocalError.
    last_status = None
    for font in soup.table.find_all('font'):
        if not font.has_attr('face'):
            last_status = font.string
            break
    return [Package(code, last_status)]
def _parse_list_table(soup):
    """Parse the table when consulting multiple objects."""

    def _has_anchor(tag):
        """TR tags that contain an A tag as child."""
        return tag.name == 'tr' and tag.find('a')

    # One Package per result row: the A tag holds the code, the FONT tag
    # the latest status.
    return [
        Package(row.find('a').string, row.find('font').string)
        for row in soup.table.find_all(_has_anchor)
    ]
def main():
    """Entry point: parse args, validate codes, fetch and print statuses."""
    args = parse_arguments()
    if args.debug:
        enable_debug()
    code_format_validation(args.code)
    for package in parse_table(request_page(args.code)):
        print(package)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment