Scrape Colombian CoA for ERPNext
#!/usr/bin/env python
"""
This script scrapes the Colombian chart of accounts in two variants from https://puc.com.co,
the most trusted online resource for the CoA in Colombia.

Note on account types:
Account types are mapped on the longest matching prefix in the 'account_types' dictionary below;
please modify it as updates become necessary and rescrape the chart of accounts.

Usage: scrape_co_chart_of_accounts.py

Execution of this script takes approx. 1 hour.
"""

import json
import random
import time
from functools import partial, reduce

import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"
}
""" | |
These don't have a correspendence in the PUC: | |
Expenses Included In Asset Valuation | |
Expenses Included In Valuation | |
Asset Received But Not Billed | |
Service Received But Not Billed | |
Stock Received But Not Billed | |
Chargeable | |
Stock Adjustment | |
Temporary | |
Docs: | |
https://docs.erpnext.com/docs/v14/user/manual/en/accounts/chart-of-accounts#24-other-account-types | |
""" | |

account_types = {
    "1105": "Cash",
    "1110": "Bank",
    "13": "Receivable",
    "14": "Stock",
    "1508": "Capital Work in Progress",
    "1512": "Capital Work in Progress",
    "1588": "Capital Work in Progress",
    "1592": "Accumulated Depreciation",
    "1597": "Accumulated Depreciation",
    "15": "Fixed Asset",
    "1699": "Accumulated Depreciation",
    "1798": "Accumulated Depreciation",
    "22": "Payable",
    "23": "Payable",
    "24": "Tax",
    "3": "Equity",
    "4": "Income Account",
    "429581": "Round Off",
    "5160": "Depreciation",
    "5165": "Depreciation",
    "5260": "Depreciation",
    "5265": "Depreciation",
    "54": "Tax",
    "5": "Expense Account",
    "6": "Cost of Goods Sold",
    "7": "Cost of Goods Sold",
}

def op(acc, elem, root_type, cutoff=float("inf")):
    """Fold one scraped account into the nested tree 'acc'."""
    new = elem.copy()
    number = new["account_number"]
    name = new["account_name"]
    del new["account_name"]

    def ischildof(i):
        # An existing node is an ancestor of this account if its number is a prefix of ours.
        return (
            isinstance(i, dict)
            and i.get("account_number") is not None
            and number.startswith(i["account_number"])
        )

    def recurse(racc, depth=1):
        if depth > cutoff:
            # Beyond the cutoff depth, collapse everything into a group node.
            racc.update({"is_group": 1})
            return racc
        for n, c in racc.items():
            if not ischildof(c):
                continue
            # Descend into the matching ancestor node.
            racc[n] = recurse(c, depth + 1)
            return racc
        # No ancestor found at this level: attach the account here.
        racc[name] = new
        if depth == 1:
            racc[name]["root_type"] = root_type
        return racc

    acc.update(recurse(acc))
    return acc
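
# A brief illustration of the fold above, using two hypothetical rows (the
# numbers, names, and values here are examples only):
#
#     rows = [
#         {"account_number": "11", "account_name": "Disponible", "account_info": "..."},
#         {"account_number": "1105", "account_name": "Caja", "account_info": "...", "account_type": "Cash"},
#     ]
#     reduce(partial(op, root_type="Asset"), rows, {})
#
# nests the more specific number under its prefix, yielding roughly:
#
#     {"Disponible": {"account_number": "11", "account_info": "...", "root_type": "Asset",
#                     "Caja": {"account_number": "1105", "account_info": "...", "account_type": "Cash"}}}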

def find_account_type(number):
    # Collect every configured prefix that matches, then pick the longest one so
    # that more specific mappings (e.g. "1592") win over generic ones (e.g. "15").
    acctypes = [(num, acc_type) for (num, acc_type) in account_types.items() if number.startswith(num)]
    acctypes = sorted(acctypes, key=lambda i: len(i[0]), reverse=True)
    return acctypes[0][1] if acctypes else None
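
# For example, account "159205" matches both "15" (Fixed Asset) and "1592"
# (Accumulated Depreciation); the longest prefix wins, so it is typed
# "Accumulated Depreciation". A number such as "3105" only matches "3" and
# therefore resolves to "Equity".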

base_url = "https://puc.com.co"

full = {}
simple = {}

for (cl, root_type) in [
    (1, "Asset"),
    (2, "Liability"),
    (3, "Equity"),
    (4, "Income"),
    (5, "Expense"),
    (6, "Expense"),
    (7, "Expense"),
    (8, "Asset"),
    (9, "Liability"),
]:
    time.sleep(5)
    url = f"{base_url}/cuentas/clase/{cl}"
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, "html.parser")
    # The second "account-list" on the class page holds the links to the accounts.
    accounts_list = soup.find_all("ul", class_="account-list")[1]
    account_links = accounts_list.find_all("a")
    rows = []
    for link in account_links:
        (number, _, name) = link.text.strip().partition(" ")
        acc_url = base_url + link["href"]
        print(f"querying {acc_url}")
        time.sleep(1 / random.randint(2, 15))
        acc_page = requests.get(acc_url, headers=headers)
        acc_soup = BeautifulSoup(acc_page.content, "html.parser")
        html_entry = acc_soup.find("div", class_="entry")
        for desc in list(html_entry.descendants):
            # Drop sidebars and absolutize relative links in the scraped description.
            if desc.name == "aside":
                desc.decompose()
            if desc.name == "a" and desc.get("href", "").startswith("/"):
                desc["href"] = base_url + desc["href"]
        item = {
            "account_number": number,
            "account_name": name,
            "account_info": html_entry.prettify(),
        }
        atyp = find_account_type(number)
        if atyp:
            item.update({"account_type": atyp})
        rows.append(item)
    # Fold the flat rows of this class into the full tree and into a
    # three-level "simple" tree.
    o = partial(op, root_type=root_type)
    reduce(o, rows, full)
    o = partial(op, root_type=root_type, cutoff=3)
    reduce(o, rows, simple)

with open("co_plan_unico_de_cuentas.json", "w") as write_file:
    json.dump(
        {
            "country_code": "co",
            "name": "Colombia PUC",
            "tree": full,
        },
        write_file,
        indent=2,
    )

with open("co_plan_unico_de_cuentas_simple.json", "w") as write_file:
    json.dump(
        {
            "country_code": "co",
            "name": "Colombia PUC Simple",
            "tree": simple,
        },
        write_file,
        indent=2,
    )
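
# Optional sanity check: a minimal sketch that reloads the generated files and
# reports how many account nodes each tree holds. count_accounts is an ad-hoc
# helper used only for this check.
def count_accounts(tree):
    # Count every nested dict that carries an account number.
    return sum(
        (1 if "account_number" in node else 0) + count_accounts(node)
        for node in tree.values()
        if isinstance(node, dict)
    )

for filename in ("co_plan_unico_de_cuentas.json", "co_plan_unico_de_cuentas_simple.json"):
    with open(filename) as read_file:
        chart = json.load(read_file)
    print(f"{filename}: {count_accounts(chart['tree'])} accounts")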