Scrape Colombian CoA for ERPNext
#!/usr/bin/env python
"""
This script scrapes the Colombian chart of accounts in two variants from https://puc.com.co,
the most trusted online resource for the CoA in Colombia.

Note on account types:
Account types are mapped on the longest matching prefix in the 'account_types' dictionary below;
please modify it as updates become necessary and rescrape the chart of accounts.

Usage: scrape_co_chart_of_accounts.py

Execution of this script takes approx. 1 hour.
"""

import json
import random
import time
from functools import partial, reduce

import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"
}
""" | |
These don't have a correspendence in the PUC: | |
Expenses Included In Asset Valuation | |
Expenses Included In Valuation | |
Asset Received But Not Billed | |
Service Received But Not Billed | |
Stock Received But Not Billed | |
Chargeable | |
Stock Adjustment | |
Temporary | |
Docs: | |
https://docs.erpnext.com/docs/v14/user/manual/en/accounts/chart-of-accounts#24-other-account-types | |
""" | |

account_types = {
    "1105": "Cash",
    "1110": "Bank",
    "13": "Receivable",
    "14": "Stock",
    "1508": "Capital Work in Progress",
    "1512": "Capital Work in Progress",
    "1588": "Capital Work in Progress",
    "1592": "Accumulated Depreciation",
    "1597": "Accumulated Depreciation",
    "15": "Fixed Asset",
    "1699": "Accumulated Depreciation",
    "1798": "Accumulated Depreciation",
    "22": "Payable",
    "23": "Payable",
    "24": "Tax",
    "3": "Equity",
    "4": "Income Account",
    "429581": "Round Off",
    "5160": "Depreciation",
    "5165": "Depreciation",
    "5260": "Depreciation",
    "5265": "Depreciation",
    "54": "Tax",
    "5": "Expense Account",
    "6": "Cost of Goods Sold",
    "7": "Cost of Goods Sold",
}

def op(acc, elem, root_type, cutoff=float("inf")):
    """Fold one scraped account into the nested tree 'acc'."""
    new = elem.copy()
    number = new["account_number"]
    name = new["account_name"]
    del new["account_name"]

    def ischildof(i):
        # An existing node is an ancestor of this account if its number is a prefix of ours.
        return (
            isinstance(i, dict)
            and i.get("account_number") is not None
            and number.startswith(i["account_number"])
        )

    def recurse(racc, depth=1):
        if depth > cutoff:
            # Beyond the cutoff depth, collapse everything into a group node.
            racc.update({"is_group": 1})
            return racc
        for n, c in racc.items():
            if not ischildof(c):
                continue
            # Descend into the matching ancestor node.
            racc[n] = recurse(c, depth + 1)
            return racc
        # No ancestor found at this level: attach the account here.
        racc[name] = new
        if depth == 1:
            racc[name]["root_type"] = root_type
        return racc

    acc.update(recurse(acc))
    return acc
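
# A brief illustration of the fold above, using two hypothetical rows (the
# numbers, names, and values here are examples only):
#
#     rows = [
#         {"account_number": "11", "account_name": "Disponible", "account_info": "..."},
#         {"account_number": "1105", "account_name": "Caja", "account_info": "...", "account_type": "Cash"},
#     ]
#     reduce(partial(op, root_type="Asset"), rows, {})
#
# nests the more specific number under its prefix, yielding roughly:
#
#     {"Disponible": {"account_number": "11", "account_info": "...", "root_type": "Asset",
#                     "Caja": {"account_number": "1105", "account_info": "...", "account_type": "Cash"}}}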

def find_account_type(number):
    # Collect every configured prefix that matches, then pick the longest one so
    # that more specific mappings (e.g. "1592") win over generic ones (e.g. "15").
    acctypes = [(num, acc_type) for (num, acc_type) in account_types.items() if number.startswith(num)]
    acctypes = sorted(acctypes, key=lambda i: len(i[0]), reverse=True)
    return acctypes[0][1] if acctypes else None
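
# For example, account "159205" matches both "15" (Fixed Asset) and "1592"
# (Accumulated Depreciation); the longest prefix wins, so it is typed
# "Accumulated Depreciation". A number such as "3105" only matches "3" and
# therefore resolves to "Equity".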

base_url = "https://puc.com.co"

full = {}
simple = {}

for (cl, root_type) in [
    (1, "Asset"),
    (2, "Liability"),
    (3, "Equity"),
    (4, "Income"),
    (5, "Expense"),
    (6, "Expense"),
    (7, "Expense"),
    (8, "Asset"),
    (9, "Liability"),
]:
    time.sleep(5)
    url = f"{base_url}/cuentas/clase/{cl}"
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, "html.parser")
    # The second "account-list" on the class page holds the links to the accounts.
    accounts_list = soup.find_all("ul", class_="account-list")[1]
    account_links = accounts_list.find_all("a")
    rows = []
    for link in account_links:
        (number, _, name) = link.text.strip().partition(" ")
        acc_url = base_url + link["href"]
        print(f"querying {acc_url}")
        time.sleep(1 / random.randint(2, 15))
        acc_page = requests.get(acc_url, headers=headers)
        acc_soup = BeautifulSoup(acc_page.content, "html.parser")
        html_entry = acc_soup.find("div", class_="entry")
        for desc in list(html_entry.descendants):
            # Drop sidebars and absolutize relative links in the scraped description.
            if desc.name == "aside":
                desc.decompose()
            if desc.name == "a" and desc.get("href", "").startswith("/"):
                desc["href"] = base_url + desc["href"]
        item = {
            "account_number": number,
            "account_name": name,
            "account_info": html_entry.prettify(),
        }
        atyp = find_account_type(number)
        if atyp:
            item.update({"account_type": atyp})
        rows.append(item)
    # Fold the flat rows of this class into the full tree and into a
    # three-level "simple" tree.
    o = partial(op, root_type=root_type)
    reduce(o, rows, full)
    o = partial(op, root_type=root_type, cutoff=3)
    reduce(o, rows, simple)

with open("co_plan_unico_de_cuentas.json", "w") as write_file:
    json.dump(
        {
            "country_code": "co",
            "name": "Colombia PUC",
            "tree": full,
        },
        write_file,
        indent=2,
    )

with open("co_plan_unico_de_cuentas_simple.json", "w") as write_file:
    json.dump(
        {
            "country_code": "co",
            "name": "Colombia PUC Simple",
            "tree": simple,
        },
        write_file,
        indent=2,
    )
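
# Optional sanity check: a minimal sketch that reloads the generated files and
# reports how many account nodes each tree holds. count_accounts is an ad-hoc
# helper used only for this check.
def count_accounts(tree):
    # Count every nested dict that carries an account number.
    return sum(
        (1 if "account_number" in node else 0) + count_accounts(node)
        for node in tree.values()
        if isinstance(node, dict)
    )

for filename in ("co_plan_unico_de_cuentas.json", "co_plan_unico_de_cuentas_simple.json"):
    with open(filename) as read_file:
        chart = json.load(read_file)
    print(f"{filename}: {count_accounts(chart['tree'])} accounts")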