Created
March 10, 2021 18:01
-
-
Save kwcooper/b211777670d9971a45d4ca8f4785b884 to your computer and use it in GitHub Desktop.
Quick script to extract links from raw text, then clean them
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
import pandas as pd

# kwc 210310
#
# Quick script to extract links from raw text, then clean them.
# Reads the raw text in `data_fName`, pulls out every URL-like token that
# mentions "linkedin", normalizes it, and writes the result to `saveName`.

data_fName = 'linkGrabberData.txt'
saveName = 'linkedinLinks.csv'

# Raw string so the regex escapes (\w, \d, \+ ...) reach the `re` module
# untouched instead of being treated as (invalid) string escapes. The pattern
# itself is unchanged: one or more "//" or "\" separators followed by a run
# of URL characters.
re_exp = r'(((//)|(\\))+([\w\d:#@%/;$()~_?\+-=\\.&](#!)?)*)'

# Compiled once at import time; reused for every call.
LINK_PATTERN = re.compile(re_exp)


def clean_link(raw):
    """Return `raw` without its leading slashes, "www." prefix and trailing junk.

    `str.strip('//www.')` would treat its argument as a *set of characters*
    and eat any leading/trailing '/', 'w' or '.', which corrupts links that
    end in 'w' (e.g. ".../in/andrew"). The prefix is therefore removed
    explicitly, and only '/', '.' and ')' are trimmed from the end.
    """
    link = raw.lstrip('/')
    if link.startswith('www.'):
        link = link[len('www.'):]
    # Trailing punctuation picked up from the surrounding prose/markup.
    return link.rstrip('/.)')


def extract_linkedin_links(text):
    """Return the cleaned LinkedIn links found in `text`, in order of appearance.

    Add custom curation code here to further pipeline process the links
    as needed.
    """
    return [
        clean_link(match.group(0))
        for match in LINK_PATTERN.finditer(text)
        if 'linkedin' in match.group(0)
    ]


def main():
    """Read the raw data file, extract the links and export them as CSV."""
    # open the raw data file and find links with regex
    with open(data_fName) as f:
        raw_text = f.read()

    good_links = extract_linkedin_links(raw_text)

    # Convert to a dataframe for easy manipulation / exporting
    df = pd.DataFrame(good_links)
    df.to_csv(saveName)


if __name__ == "__main__":
    main()

# Old expressions I've used
# re_exp = '[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
# re_exp = '((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*'
# re_exp = '((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment