Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save quis/1fd2635c46f05b227af5b11dfe48b0d1 to your computer and use it in GitHub Desktop.
Save quis/1fd2635c46f05b227af5b11dfe48b0d1 to your computer and use it in GitHub Desktop.
import re
import sys
from collections import OrderedDict
from io import BytesIO
from itertools import chain

import requests
from bs4 import BeautifulSoup
from joblib import Memory
from tika import parser
# Directory (relative to the working directory) handed to joblib as its cache
# location.
location = './cachedir'
# joblib memoiser backed by `location`; its `cache` decorator wraps the fetch
# helpers defined below. verbose=0 silences joblib's own logging.
memory = Memory(location, verbose=0)
def normalise_url(url):
    """Return an absolute URL for a link taken from a GOV.UK page.

    Args:
        url: An absolute URL, a protocol-relative URL (``//host/path``) or a
            path relative to the GOV.UK site root (with or without a leading
            slash).

    Returns:
        ``url`` unchanged when it already carries an http(s) scheme;
        otherwise the same resource expressed as an absolute https URL.
    """
    if url.lower().startswith(("http://", "https://")):
        return url
    # Protocol-relative links name their own host, so only the scheme is
    # missing; prefixing the GOV.UK host would produce a broken URL.
    if url.startswith("//"):
        return "https:" + url
    # Guard against "host" and "path" being glued together when the path has
    # no leading slash. The empty string still maps to the bare site root.
    if url and not url.startswith("/"):
        url = "/" + url
    return "https://www.gov.uk" + url
@memory.cache
def _get_content(url, as_bs=False):
    """Download ``url`` and return its body, memoised through ``memory``.

    Args:
        url: Absolute URL or GOV.UK-relative path; it is passed through
            ``normalise_url`` before the request is made.
        as_bs: When true, return the body parsed by BeautifulSoup's
            ``html.parser``; otherwise return the raw response bytes.

    Returns:
        A ``BeautifulSoup`` document or ``bytes``, depending on ``as_bs``.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    # Without a timeout a stalled connection would hang the script forever.
    response = requests.get(normalise_url(url), timeout=60)
    # Fail before returning: a returned error page would be stored by the
    # cache decorator and silently reused on every later run.
    response.raise_for_status()
    if as_bs:
        return BeautifulSoup(response.content, features="html.parser")
    return response.content
@memory.cache
def get_pdf_content(url):
    """Fetch the document at ``url`` and return Tika's parse result for it.

    The bytes come from ``_get_content``; they are wrapped in an in-memory
    stream and handed to ``tika.parser.from_buffer``, whose return value is
    passed back unchanged. The call is memoised through ``memory``.
    """
    pdf_stream = BytesIO(_get_content(url))
    return parser.from_buffer(pdf_stream)
def _sanitise_name(consulate_name):
pattern = re.compile("consular fees", re.IGNORECASE)
consulate_name = pattern.sub("", consulate_name)
pattern = re.compile("consular fee", re.IGNORECASE)
consulate_name = pattern.sub("", consulate_name)
consulate_name = consulate_name.replace(":", "")
consulate_name = consulate_name.replace("–", "")
consulate_name = consulate_name.replace("-", "")
consulate_name = consulate_name.replace("2019", "")
return consulate_name.strip()
# A parenthesised run of the letter "i" at the start of a token:
# (i), (ii), (iii), ...
_SUB_FEE_MARKER = re.compile(r'^\(i+\)')


def get_fees_from_content(raw):
    """Yield the fee identifiers found in a parsed document's text.

    The text is scanned as whitespace-separated tokens while tracking the
    most recently accepted fee number (initially 0):

    * a token starting with a digit or with a ``(i...)`` marker is
      considered; every other token is ignored;
    * considered tokens containing ``.``, ``,`` or ``/`` are skipped;
    * a ``(i...)`` token yields the string ``"<number> <token>"``;
    * any other considered token yields the current number (an ``int``),
      after first advancing it when the token is an integer below 25 that is
      greater than the current number.

    The same value can be yielded many times; callers de-duplicate.

    Args:
        raw: Mapping with the document text under ``'content'``. A missing
            or ``None`` value is treated as empty text.

    Yields:
        ``int`` fee numbers and ``str`` sub-fee identifiers, in document
        order.
    """
    # A parse result can carry no text at all (content missing or None);
    # treat that as an empty document rather than crashing.
    content = raw.get('content') or ''
    prev = 0
    for token in content.split():
        is_sub_fee = _SUB_FEE_MARKER.match(token) is not None
        if not (token[0].isdigit() or is_sub_fee):
            continue
        # Skips tokens such as "3.50", "1,000" or "01/04".
        # NOTE(review): presumably these are amounts and dates; confirm.
        if '.' in token or ',' in token or '/' in token:
            continue
        if is_sub_fee:
            yield "{} {}".format(prev, token)
            continue
        try:
            i = int(token)
        except ValueError:
            # Starts with a digit but is not an integer (e.g. "1st"); the
            # current fee number stays as it is.
            pass
        else:
            # Fee numbers are only accepted in increasing order and below 25.
            # NOTE(review): 25 looks like an upper bound on the number of
            # fee entries; confirm.
            if i < 25 and i > prev:
                prev = i
        yield prev
def ordinal_fee_type(fee_type):
    """Return a numeric sort key for a fee identifier string.

    The text before the first space is converted to a float. If the
    identifier contains "iii", "ii" or "i" (checked in that order), 0.3, 0.2
    or 0.1 respectively is added, so sub-fees sort after their parent number
    and before the next one.
    """
    base = float(fee_type.partition(' ')[0])
    # Longest marker first: "iii" also contains "ii" and "i".
    for marker, offset in (('iii', 0.3), ('ii', 0.2), ('i', 0.1)):
        if marker in fee_type:
            return base + offset
    return base
# Script body: build a pipe-delimited table showing which fee identifiers
# appear in each document linked from the GOV.UK consular-fees collection.
page = _get_content("/government/collections/consular-fees", as_bs=True)

# (sanitised link text, absolute PDF URL) -> fee identifiers found in that
# PDF, as strings, in first-seen order.
all_fees = OrderedDict()

for link in page.select('[data-track-category=navDocumentCollectionLinkClicked]'):
    # Use the shared helper rather than hand-prefixing the host, so an href
    # that is already absolute is not mangled.
    page_url = normalise_url(link['href'])
    page_with_pdf_link = _get_content(page_url, as_bs=True)
    pdf_link = page_with_pdf_link.select_one('.attachment-details a')
    if pdf_link is None:
        # select_one returns None when nothing matches; report the page and
        # carry on instead of dying with a TypeError half-way through.
        print(
            'No attachment link found on {}; skipping'.format(page_url),
            file=sys.stderr,
        )
        continue
    raw = get_pdf_content(pdf_link['href'])
    fees = list(get_fees_from_content(raw))
    # Drop a plain fee number N when the same document also has "N (i)":
    # the fee is then represented by its sub-items only.
    found = set(fees)
    filtered_fees = [fee for fee in fees if '{} (i)'.format(fee) not in found]
    # OrderedDict.fromkeys de-duplicates while keeping first-seen order.
    deduped_fees = OrderedDict.fromkeys(filtered_fees)
    key = (_sanitise_name(link.text), normalise_url(pdf_link['href']))
    all_fees[key] = list(map(str, deduped_fees))

# Every fee identifier seen in any document, in numeric order, with sub-items
# directly after their parent number.
all_possible_fees = OrderedDict.fromkeys(chain.from_iterable(all_fees.values()))
all_possible_fees = sorted(all_possible_fees, key=ordinal_fee_type)

# Header row, then one row per document: a spreadsheet HYPERLINK formula
# followed by y/n for each fee column.
print('|'.join(['country'] + all_possible_fees))
for (country, pdf_link), country_fees in all_fees.items():
    present = set(country_fees)
    flags = ['y' if fee in present else 'n' for fee in all_possible_fees]
    print('=HYPERLINK("{}", "{}")|{}'.format(pdf_link, country, '|'.join(flags)))
beautifulsoup4==4.8.1
tika==1.22
joblib==0.14.0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment