Skip to content

Instantly share code, notes, and snippets.

@kwcooper
Created March 10, 2021 18:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kwcooper/b211777670d9971a45d4ca8f4785b884 to your computer and use it in GitHub Desktop.
Save kwcooper/b211777670d9971a45d4ca8f4785b884 to your computer and use it in GitHub Desktop.
Quick script to extract links from raw text, then clean them
import re
import pandas as pd
# kwc 210310
data_fName = 'linkGrabberData.txt'
saveName = 'linkedinLinks.csv'
# Raw string so the backslash escapes reach the regex engine untouched (the
# pattern itself is unchanged). It matches a run of '//' or '\' followed by
# any number of URL-ish characters, so the scheme ('https:') is not captured.
# NOTE(review): '\+-=' inside the character class is a *range* ('+' .. '='),
# which also admits ',' and '<'. Kept as-is so the same links are found.
re_exp = r'(((//)|(\\))+([\w\d:#@%/;$()~_?\+-=\\.&](#!)?)*)'


def find_links(text):
    """Return every substring of *text* matched by ``re_exp``, in order.

    Only the full match is returned; the inner capture groups of the
    pattern are ignored.
    """
    return [match.group(0) for match in re.finditer(re_exp, text)]


def clean_link(raw):
    """Tidy one raw match into a bare link.

    Removes the leading slashes and an optional ``www.`` prefix, trims a
    trailing ``/`` or ``.``, then drops surrounding ``)`` characters (e.g.
    from links that were wrapped in parentheses).

    ``str.strip('//www.')`` is deliberately not used: it strips a *set of
    characters* from both ends, so a link ending in ``w`` would be truncated.
    """
    link = raw.lstrip('/')
    if link.startswith('www.'):
        link = link[len('www.'):]
    link = link.rstrip('/.')
    return link.strip(')')


def linkedin_links(text):
    """Return the cleaned links found in *text* that mention 'linkedin'."""
    # Add custom curation code here to further
    # process the links as needed
    return [clean_link(url) for url in find_links(text) if 'linkedin' in url]


def main():
    """Read ``data_fName``, extract the LinkedIn links, write ``saveName``."""
    # open the raw data file and find links with regex
    with open(data_fName) as f:
        urls = f.read()
    good_links = linkedin_links(urls)
    # Convert to a dataframe for easy manipulation / exporting
    df = pd.DataFrame(good_links)
    df.to_csv(saveName)


if __name__ == '__main__':
    main()
# Old expressions I've used
# re_exp = '[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
# re_exp = '((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*'
# re_exp = '((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment