Skip to content

Instantly share code, notes, and snippets.

@blaggacao
Last active June 24, 2023 16:35
Show Gist options
  • Save blaggacao/d45a454d27556f41fef88833937088f1 to your computer and use it in GitHub Desktop.
Save blaggacao/d45a454d27556f41fef88833937088f1 to your computer and use it in GitHub Desktop.
Scrape Colombian CoA for ERPNext
#!/usr/bin/env python
"""
This script scrapes the colombian chart of accounts in two variants from https://puc.com.co
https://puc.com.co is the most trusted online resource for CoA in Colombia.
Note on Account Types:
account types are mapped on the longest prefix in the 'account_types' dictionary below,
please modify as updates become necessary and rescrape the chart of accounts.
Usage: scrape_co_chart_of_accounts.py
Executions of this script takes aprox. 1 hour
"""
import csv
import json
import random
import time
from functools import partial, reduce
import requests
from bs4 import BeautifulSoup
headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"
}
"""
These don't have a correspendence in the PUC:
Expenses Included In Asset Valuation
Expenses Included In Valuation
Asset Received But Not Billed
Service Received But Not Billed
Stock Received But Not Billed
Chargeable
Stock Adjustment
Temporary
Docs:
https://docs.erpnext.com/docs/v14/user/manual/en/accounts/chart-of-accounts#24-other-account-types
"""
account_types = {
"1105": "Cash",
"1110": "Bank",
"13": "Receivable",
"14": "Stock",
"1508": "Capital Work in Progress",
"1512": "Capital Work in Progress",
"1588": "Capital Work in Progress",
"1592": "Accumulated Depreciation",
"1597": "Accumulated Depreciation",
"15": "Fixed Asset",
"1699": "Accumulated Depreciation",
"1798": "Accumulated Depreciation",
"22": "Payable",
"23": "Payable",
"24": "Tax",
"3": "Equity",
"4": "Income Account",
"429581": "Round Off",
"5160": "Depreciation",
"5165": "Depreciation",
"5260": "Depreciation",
"5265": "Depreciation",
"54": "Tax",
"5": "Expense Account",
"6": "Cost of Goods Sold",
"7": "Cost of Goods Sold",
}
def op(acc, elem, root_type, cutoff=float("inf")):
new = elem.copy()
number = new["account_number"]
name = new["account_name"]
del new["account_name"]
def ischildof(i):
return (
isinstance(i, dict)
and i.get("account_number") is not None
and number.startswith(i["account_number"])
)
def recurse(racc, depth=1):
if depth > cutoff:
racc.update({"is_group": 1})
return racc
for n, c in racc.items():
if not ischildof(c):
continue
racc[n] = recurse(c, depth + 1)
return racc
racc[name] = new
if depth == 1:
racc[name]["root_type"] = root_type
return racc
acc.update(recurse(acc))
return acc
def find_account_type(number):
acctypes = [(num, type) for (num, type) in account_types.items() if number.startswith(num)]
acctypes = sorted(acctypes, key=lambda i: i[0])
return acctypes[0][1] if acctypes != [] else None
base_url = "https://puc.com.co"
full = {}
simple = {}
for (cl, root_type) in [
(1, "Asset"),
(2, "Liability"),
(3, "Equity"),
(4, "Income"),
(5, "Expense"),
(6, "Expense"),
(7, "Expense"),
(8, "Asset"),
(9, "Liability"),
]:
rows = []
time.sleep(5)
url = f"{base_url}/cuentas/clase/{cl}"
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, "html.parser")
accounts_list = soup.find_all("ul", class_="account-list")[1]
account_links = accounts_list.find_all("a")
rows = []
for link in account_links:
(number, _, name) = link.text.strip().partition(" ")
acc_url = base_url + link["href"]
print(f"querying {acc_url}")
time.sleep(1 / random.randint(2, 15))
acc_page = requests.get(acc_url, headers=headers)
acc_soup = BeautifulSoup(acc_page.content, "html.parser")
html_entry = acc_soup.find("div", class_="entry")
for desc in list(html_entry.descendants):
if desc.name == "aside":
desc.decompose()
if desc.name == "a" and desc["href"].startswith("/"):
desc["href"] = base_url + desc["href"]
item = {
"account_number": number,
"account_name": name,
"account_info": html_entry.prettify(),
}
atyp = find_account_type(number)
if atyp:
item.update({"account_type": atyp})
rows.append(item)
o = partial(op, root_type=root_type)
reduce(o, rows, full)
o = partial(op, root_type=root_type, cutoff=3)
reduce(o, rows, simple)
with open("co_plan_unico_de_cuentas.json", "w") as write_file:
json.dump(
{
"country_code": "co",
"name": "Colombia PUC",
"tree": full,
},
write_file,
indent=2,
)
with open("co_plan_unico_de_cuentas_simple.json", "w") as write_file:
json.dump(
{
"country_code": "co",
"name": "Colombia PUC Simple",
"tree": simple,
},
write_file,
indent=2,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment