Skip to content

Instantly share code, notes, and snippets.

@redgeoff
Created November 19, 2023 23:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save redgeoff/95fa190669e5282b28743616d7eced1d to your computer and use it in GitHub Desktop.
Save redgeoff/95fa190669e5282b28743616d7eced1d to your computer and use it in GitHub Desktop.
LinkedIn Profile Extractor
from bs4 import BeautifulSoup
import re
import csv
# Load the uploaded HTML file
html_file_path = '/mnt/data/lease-abstractions.html'
# Read the whole file into memory as UTF-8 text; the with-block closes the
# file handle as soon as the read finishes.
with open(html_file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
# Function to sanitize HTML content
def sanitize_html(content: str) -> str:
    r"""Remove escaped empty-comment markers and flatten line breaks and tabs.

    Args:
        content: HTML text to clean.

    Returns:
        ``content`` with every literal marker of the form ``\x3C!---->``
        (a backslash, ``x3C!``, three or more dashes, then ``>``) deleted,
        each run of newline, tab and carriage-return characters replaced by
        a single space, and leading/trailing whitespace stripped.
    """
    # The pattern matches the *text* "\x3C!" (a real backslash followed by
    # "x3C!"), not an actual "<" character, so ordinary "<!---->" comments
    # are left untouched.
    content = re.sub(r'\\x3C!---+>', '', content)
    # Collapse each run of newlines, tabs and carriage returns into one
    # space, then trim both ends.
    content = re.sub(r'[\n\t\r]+', ' ', content).strip()
    return content
# Clean the loaded HTML text with sanitize_html; the result is what gets
# parsed for profiles further down.
sanitized_html = sanitize_html(html_content)
# Define the function to extract profiles using the sanitized HTML
def extract_profiles(html: str) -> list:
    """Build one profile dictionary per search-result element in ``html``.

    Every ``<div>`` carrying the class ``entity-result__item`` yields one
    dictionary, appended in document order. A key is only set when the
    element it comes from is found:

    * ``"LinkedIn URL"`` - ``href`` of the first ``<a>`` with class
      ``app-aware-link``, cut at the first ``?`` and stripped.
    * ``"Name"`` - ``alt`` text of the first ``<img>`` with class
      ``presence-entity__image``, stripped.
    * ``"Title"`` / ``"Company"`` - text of the first ``<div>`` with class
      ``entity-result__primary-subtitle``, split at the first ``" at "``.
      When there is no ``" at "`` the whole text becomes the title and the
      company is the empty string.

    Args:
        html: HTML markup to parse.

    Returns:
        A list of dictionaries mapping the keys above to strings. A result
        element with none of the expected children contributes an empty
        dictionary.
    """
    soup = BeautifulSoup(html, 'html.parser')
    profiles = []
    for entity in soup.find_all("div", class_="entity-result__item"):
        profile = {}

        # Extract LinkedIn URL without query parameters and strip whitespace
        link_tag = entity.find("a", class_="app-aware-link")
        if link_tag and 'href' in link_tag.attrs:
            profile["LinkedIn URL"] = link_tag['href'].split('?')[0].strip()

        # Extract Name (taken from the image's alt text) and strip whitespace
        name_tag = entity.find("img", class_="presence-entity__image")
        if name_tag and 'alt' in name_tag.attrs:
            profile["Name"] = name_tag['alt'].strip()

        # Extract Title and Company and strip whitespace
        subtitle_tag = entity.find("div", class_="entity-result__primary-subtitle")
        if subtitle_tag:
            # Join the text fragments with spaces, then squeeze any run of
            # whitespace down to a single space.
            subtitle_text = ' '.join(subtitle_tag.get_text(" ", strip=True).split())
            # partition splits at the first " at "; when the separator is
            # absent it returns (text, '', ''), which gives an empty company.
            title, _, company = subtitle_text.partition(' at ')
            profile["Title"] = title.strip()
            profile["Company"] = company.strip()

        profiles.append(profile)
    return profiles
# Extract profiles from the sanitized HTML content
extracted_profiles = extract_profiles(sanitized_html)
# Output file path for the CSV
output_csv_path = '/mnt/data/extracted_profiles.csv'
# Writing the extracted data to a CSV file
# newline='' leaves line-ending handling to the csv module.
with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['Name', 'LinkedIn URL', 'Title', 'Company']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for profile in extracted_profiles:
        # Replace any literal backslash-n text (the two characters "\" and
        # "n", not a real line break) with a space, then trim the value.
        # Only existing keys are reassigned, so updating the dict while
        # iterating over its items is safe.
        for key, value in profile.items():
            profile[key] = value.replace('\\n', ' ').strip()
        # Keys missing from a profile are written as empty cells.
        writer.writerow(profile)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment