Skip to content

Instantly share code, notes, and snippets.

@ershovio
Last active January 31, 2024 11:55
Show Gist options
  • Save ershovio/1fb20cb2761f51b26e99a55043cf371f to your computer and use it in GitHub Desktop.
Save ershovio/1fb20cb2761f51b26e99a55043cf371f to your computer and use it in GitHub Desktop.
Parser to create Pandas Dataframe from HTML pages from crunchbase.com
import os
from datetime import datetime
import pandas as pd
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
def parse_crunchbase_html(soup: BeautifulSoup) -> pd.DataFrame:
"""
Parse html page with search results from crunchbase.com
"""
# Get data for each company
companies_data = soup.find_all("grid-row", {"class": "ng-star-inserted"})
# Get all column ids
grid_cells = companies_data[0].find_all("grid-cell", {"class": lambda x: x and x.startswith("column-id")})
classes = [g["class"] for g in grid_cells]
classes = [item for sublist in classes for item in sublist]
column_ids = list(set([c for c in classes if c.startswith("column-id")]))
# Get column names
header = soup.find("grid-header")
column_ids_to_names = {}
for c in column_ids:
try:
column_ids_to_names[c] = header.find("grid-column-header", {"class": c}).text.strip()
except Exception:
continue
# Parse data for each company
name_column = column_ids_to_names["column-id-identifier"]
parsed_data = []
for company_data in companies_data:
# Get company name
company_name = company_data.find("div", {"class": "identifier-label"}).text.strip()
# Get data for each column
parsed_company_data = {name_column: company_name}
for cid, cname in column_ids_to_names.items():
parsed_company_data[cname] = company_data.find("grid-cell", {"class": cid}).text.strip()
parsed_data.append(parsed_company_data)
# Create dataframe
return pd.DataFrame(parsed_data)
def parse_crunchbase_html_files(folder_name: str) -> pd.DataFrame:
"""
Parse all html files in a folder
"""
abs_path = os.path.abspath(folder_name)
html_filenames = [os.path.join(abs_path, f) for f in os.listdir(folder_name) if f.endswith(".html")]
dfs = []
for html_filename in tqdm(html_filenames):
with open(html_filename, "r") as f:
soup = BeautifulSoup(f, "html.parser")
df = parse_crunchbase_html(soup)
dfs.append(df)
return pd.concat(dfs)
def enrich_crunchbase_df(df: pd.DataFrame) -> pd.DataFrame:
"""
Drop duplicates, parse several columns and remove columns with empty values
"""
edf = df.copy(deep=True)
edf["Founded Date parsed"] = edf["Founded Date"].apply(_parse_date_column)
edf["Founded Year"] = edf["Founded Date parsed"].apply(lambda x: int(x.year) if x else None)
edf["Max revenue in millions"] = edf["Estimated Revenue Range"].apply(_get_max_revenue_in_milions)
edf["Max number of employees"] = edf["Number of Employees"].apply(_get_max_number_of_employees)
edf["Headquarter county"] = edf["Headquarters Location"].apply(lambda x: x.split(",")[-1].strip())
edf = edf.drop(["Facebook", "Twitter", "LinkedIn"], axis=1)
edf = edf.drop_duplicates(subset=["Organization Name"], keep="first")
return edf
def prepare_df_for_excel_export(df: pd.DataFrame) -> pd.DataFrame:
"""
Fix illegal characters in a dataframe to be exported to excel
"""
return df.applymap(
lambda x: x.replace("\r", " ").replace("\n", " ").replace("\t", " ").replace("\v", " ").replace("\x03", " ")
if isinstance(x, str)
else x
)
def _parse_date_column(date_str: str):
"""
Parse a date string to a datetime object
"""
for date_format in ["%Y", "%b %d, %Y", "%d %b %Y", "%b %Y"]:
try:
return datetime.strptime(date_str, date_format).date()
except ValueError:
continue
return None
def _get_max_revenue_in_milions(revenue_range: str):
"""
Get a max number from a revenue range string in millions.
Example: '$10M to $50M' to to 50. '$500M to $1B' to 1000 $10b+ to 10000
"""
parsed_revenue_range = revenue_range.replace("$", "").replace("M", "").replace("B", "000").replace("+", "")
parsed_revenue_range = parsed_revenue_range.split(" to ")
return int(parsed_revenue_range[-1])
def _get_max_number_of_employees(employee_range: str) -> int | None:
"""
Translates an employee range string to the maximum number.
Example: '1-10' translates to 10. '10001+' translates to 10001.
"""
if employee_range == "—":
return None
else:
parsed_employee_range = employee_range.replace("+", "").split("-")
return int(parsed_employee_range[-1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment