Last active
April 25, 2023 04:46
-
-
Save 33sticks/e033dee49a061101078c to your computer and use it in GitHub Desktop.
A Python script that takes a CSV file containing URLs, say from a web server log or an analytics platform, and parses the URLs into their component parts, which are then appended to a data frame for further manipulation and/or data analysis.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""Parse a CSV of URLs into components and export the unique domains.

Reads a CSV with a `url` column, splits each URL into protocol, domain,
path, query, and fragment columns, then writes the unique domains to
`unique_domains.csv`.
"""

# Python 3 home of the old `urlparse` module
from urllib.parse import urlparse

import numpy as np
import pandas as pd


def add_url_parts(df):
    """Add protocol/domain/path/query/fragment columns parsed from `df.url`.

    Each unique URL is parsed exactly once and its parts broadcast to all
    matching rows, so duplicate URLs cost a single `urlparse` call.
    Mutates `df` in place and also returns it for convenience.
    """
    # Initialize every part column once (the original initialized
    # 'domain' twice).
    for col in ('protocol', 'domain', 'path', 'query', 'fragment'):
        df[col] = ''
    unique_urls = df.url.unique()
    total = len(unique_urls)
    for i, url in enumerate(unique_urls, start=1):
        # Simple same-line progress indicator.
        print('\r%d / %d' % (i, total), end='')
        parts = urlparse(url)
        rows = df.url == url
        df.loc[rows, 'protocol'] = parts.scheme
        df.loc[rows, 'domain'] = parts.netloc
        df.loc[rows, 'path'] = parts.path
        df.loc[rows, 'query'] = parts.query
        df.loc[rows, 'fragment'] = parts.fragment
    return df


def export_unique_domains(df, out_path='unique_domains.csv'):
    """Write the unique values of `df.domain` to `out_path` as CSV."""
    # Series.unique() replaces the deprecated Series.ravel() round-trip.
    domains = df['domain'].unique()
    np.savetxt(
        out_path,             # file name
        domains,              # array to save
        delimiter=',',        # column delimiter
        fmt='%s',
        newline='\n',         # new line character
        footer='end of file', # file footer
        comments='# ',        # character to use for comments
        header='Unique Domains by numpy')  # file header


if __name__ == '__main__':
    # Load the URL data; adjust the path to your environment.
    frame = pd.read_csv('/Users/Documents/analysis/my_urls.csv')
    add_url_parts(frame)
    export_unique_domains(frame)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment